// This file is part of AsmJit project <https://asmjit.com>
//
// See asmjit.h or LICENSE.md for license and copyright information
// SPDX-License-Identifier: Zlib

#include <asmjit/core.h>

#if !defined(ASMJIT_NO_X86)
#include <asmjit/x86.h>

#include <limits>
#include <stdio.h>
#include <string.h>

#include "asmjit_test_misc.h"
#include "asmjit_test_perf.h"

using namespace asmjit;

// Operand form requested from the instruction sequence generators in this file.
enum class InstForm {
  // Emit the register-operand variant of each sequence.
  kReg = 0,
  // Emit the variant in which instructions take a memory operand.
  kMem = 1
};

// Generates a long sequence of GP instructions.
//
// The sequence is fixed: registers `a`, `b`, `c` and `d` are first loaded with constant bit patterns and then
// one of two instruction streams is emitted through `cc`, selected by `form`:
//
//   - `InstForm::kReg` - instructions that only use register (and immediate) operands.
//   - otherwise        - instructions that take a memory operand addressed by `c`.
//
// Both streams start with baseline integer instructions, continue with rotates/shifts that take their count
// from the low 8-bit part of `c`, and end with ADX, BMI/BMI2, LZCNT and POPCNT instructions.
//
// NOTE(review): the emitted code does not look like it is meant to be executed - in the memory form `c` holds a
// constant (and is modified by the sequence) while it's also used as the address. Presumably only the emission
// itself is measured; confirm against the callers.
template<typename Emitter>
static void generateGpSequenceInternal(
  Emitter& cc,
  InstForm form,
  const x86::Gp& a, const x86::Gp& b, const x86::Gp& c, const x86::Gp& d) {

  // Load constants into all four registers.
  cc.mov(a, 0xAAAAAAAA);
  cc.mov(b, 0xBBBBBBBB);
  cc.mov(c, 0xCCCCCCCC);
  cc.mov(d, 0xFFFFFFFF);

  if (form == InstForm::kReg) {
    // Baseline integer instructions - mostly emitted in triples over the (a, b), (b, c) and (c, d) pairs.
    cc.adc(a, b);
    cc.adc(b, c);
    cc.adc(c, d);
    cc.add(a, b);
    cc.add(b, c);
    cc.add(c, d);
    cc.and_(a, b);
    cc.and_(b, c);
    cc.and_(c, d);
    cc.bsf(a, b);
    cc.bsf(b, c);
    cc.bsf(c, d);
    cc.bsr(a, b);
    cc.bsr(b, c);
    cc.bsr(c, d);
    cc.bswap(a);
    cc.bswap(b);
    cc.bswap(c);
    cc.bt(a, b);
    cc.bt(b, c);
    cc.bt(c, d);
    cc.btc(a, b);
    cc.btc(b, c);
    cc.btc(c, d);
    cc.btr(a, b);
    cc.btr(b, c);
    cc.btr(c, d);
    cc.bts(a, b);
    cc.bts(b, c);
    cc.bts(c, d);
    cc.cmp(a, b);
    cc.cmovc(a, b);
    cc.cmp(b, c);
    cc.cmovc(b, c);
    cc.cmp(c, d);
    cc.cmovc(c, d);
    cc.dec(a);
    cc.dec(b);
    cc.dec(c);
    cc.imul(a, b);
    cc.imul(b, c);
    cc.imul(c, d);
    cc.movsx(a, b.r8Lo());
    cc.movsx(b, c.r8Lo());
    cc.movsx(c, d.r8Lo());
    cc.movzx(a, b.r8Lo());
    cc.movzx(b, c.r8Lo());
    cc.movzx(c, d.r8Lo());
    cc.neg(a);
    cc.neg(b);
    cc.neg(c);
    cc.not_(a);
    cc.not_(b);
    cc.not_(c);
    cc.or_(a, b);
    cc.or_(b, c);
    cc.or_(c, d);
    cc.sbb(a, b);
    cc.sbb(b, c);
    cc.sbb(c, d);
    cc.sub(a, b);
    cc.sub(b, c);
    cc.sub(c, d);
    cc.test(a, b);
    cc.test(b, c);
    cc.test(c, d);
    cc.xchg(a, b);
    cc.xchg(b, c);
    cc.xchg(c, d);
    cc.xor_(a, b);
    cc.xor_(b, c);
    cc.xor_(c, d);

    // Rotates and shifts - the count is always the low 8-bit part of `c`, so the destinations are `a`, `b`
    // and `d` here (not `c`).
    cc.rcl(a, c.r8Lo());
    cc.rcl(b, c.r8Lo());
    cc.rcl(d, c.r8Lo());
    cc.rcr(a, c.r8Lo());
    cc.rcr(b, c.r8Lo());
    cc.rcr(d, c.r8Lo());
    cc.rol(a, c.r8Lo());
    cc.rol(b, c.r8Lo());
    cc.rol(d, c.r8Lo());
    cc.ror(a, c.r8Lo());
    cc.ror(b, c.r8Lo());
    cc.ror(d, c.r8Lo());
    cc.shl(a, c.r8Lo());
    cc.shl(b, c.r8Lo());
    cc.shl(d, c.r8Lo());
    cc.shr(a, c.r8Lo());
    cc.shr(b, c.r8Lo());
    cc.shr(d, c.r8Lo());
    cc.sar(a, c.r8Lo());
    cc.sar(b, c.r8Lo());
    cc.sar(d, c.r8Lo());
    cc.shld(a, b, c.r8Lo());
    cc.shld(b, d, c.r8Lo());
    cc.shld(d, a, c.r8Lo());
    cc.shrd(a, b, c.r8Lo());
    cc.shrd(b, d, c.r8Lo());
    cc.shrd(d, a, c.r8Lo());

    // ADX, BMI/BMI2, LZCNT and POPCNT instructions.
    cc.adcx(a, b);
    cc.adox(a, b);
    cc.adcx(b, c);
    cc.adox(b, c);
    cc.adcx(c, d);
    cc.adox(c, d);
    cc.andn(a, b, c);
    cc.andn(b, c, d);
    cc.andn(c, d, a);
    cc.bextr(a, b, c);
    cc.bextr(b, c, d);
    cc.bextr(c, d, a);
    cc.blsi(a, b);
    cc.blsi(b, c);
    cc.blsi(c, d);
    cc.blsmsk(a, b);
    cc.blsmsk(b, c);
    cc.blsmsk(c, d);
    cc.blsr(a, b);
    cc.blsr(b, c);
    cc.blsr(c, d);
    cc.bzhi(a, b, c);
    cc.bzhi(b, c, d);
    cc.bzhi(c, d, a);
    cc.lzcnt(a, b);
    cc.lzcnt(b, c);
    cc.lzcnt(c, d);
    cc.pdep(a, b, c);
    cc.pdep(b, c, d);
    cc.pdep(c, d, a);
    cc.pext(a, b, c);
    cc.pext(b, c, d);
    cc.pext(c, d, a);
    cc.popcnt(a, b);
    cc.popcnt(b, c);
    cc.popcnt(c, d);
    cc.rorx(a, b, 8);
    cc.rorx(b, c, 8);
    cc.rorx(c, d, 8);
    cc.sarx(a, b, c);
    cc.sarx(b, c, d);
    cc.sarx(c, d, a);
    cc.shlx(a, b, c);
    cc.shlx(b, c, d);
    cc.shlx(c, d, a);
    cc.shrx(a, b, c);
    cc.shrx(b, c, d);
    cc.shrx(c, d, a);
    cc.tzcnt(a, b);
    cc.tzcnt(b, c);
    cc.tzcnt(c, d);
  }
  else {
    // Memory operands used by this form - both use `c` as the base register:
    //   - `m`  has the size of the emitter's native register (`cc.registerSize()`).
    //   - `m8` is a byte-sized operand, used as the source of `movsx` and `movzx`.
    uint32_t regSize = cc.registerSize();
    x86::Mem m = x86::ptr(c, 0, regSize);
    x86::Mem m8 = x86::byte_ptr(c);

    // Baseline integer instructions with a memory operand. Instructions that only have a single operand
    // (dec, neg, not) are emitted once, operating on `m`.
    cc.adc(a, m);
    cc.adc(b, m);
    cc.adc(c, m);
    cc.add(a, m);
    cc.add(b, m);
    cc.add(c, m);
    cc.and_(a, m);
    cc.and_(b, m);
    cc.and_(c, m);
    cc.bsf(a, m);
    cc.bsf(b, m);
    cc.bsf(c, m);
    cc.bsr(a, m);
    cc.bsr(b, m);
    cc.bsr(c, m);
    cc.bt(m, a);
    cc.bt(m, b);
    cc.bt(m, c);
    cc.btc(m, a);
    cc.btc(m, b);
    cc.btc(m, c);
    cc.btr(m, a);
    cc.btr(m, b);
    cc.btr(m, c);
    cc.bts(m, a);
    cc.bts(m, b);
    cc.bts(m, c);
    cc.cmp(a, m);
    cc.cmovc(a, m);
    cc.cmp(b, m);
    cc.cmovc(b, m);
    cc.cmp(c, m);
    cc.cmovc(c, m);
    cc.dec(m);
    cc.movsx(a, m8);
    cc.movsx(b, m8);
    cc.movsx(c, m8);
    cc.movzx(a, m8);
    cc.movzx(b, m8);
    cc.movzx(c, m8);
    cc.neg(m);
    cc.not_(m);
    cc.or_(a, m);
    cc.or_(b, m);
    cc.or_(c, m);
    cc.sbb(a, m);
    cc.sbb(b, m);
    cc.sbb(c, m);
    cc.sub(a, m);
    cc.sub(b, m);
    cc.sub(c, m);
    cc.test(m, a);
    cc.test(m, b);
    cc.test(m, c);
    cc.xchg(a, m);
    cc.xchg(b, m);
    cc.xchg(c, m);
    cc.xor_(a, m);
    cc.xor_(b, m);
    cc.xor_(c, m);

    // Rotates and shifts of the memory operand - the count is the low 8-bit part of `c`.
    cc.rcl(m, c.r8Lo());
    cc.rcr(m, c.r8Lo());
    cc.rol(m, c.r8Lo());
    cc.ror(m, c.r8Lo());
    cc.shl(m, c.r8Lo());
    cc.shr(m, c.r8Lo());
    cc.sar(m, c.r8Lo());
    cc.shld(m, b, c.r8Lo());
    cc.shld(m, d, c.r8Lo());
    cc.shld(m, a, c.r8Lo());
    cc.shrd(m, b, c.r8Lo());
    cc.shrd(m, d, c.r8Lo());
    cc.shrd(m, a, c.r8Lo());

    // ADX, BMI/BMI2, LZCNT and POPCNT instructions with a memory source operand.
    cc.adcx(a, m);
    cc.adox(a, m);
    cc.adcx(b, m);
    cc.adox(b, m);
    cc.adcx(c, m);
    cc.adox(c, m);
    cc.andn(a, b, m);
    cc.andn(b, c, m);
    cc.andn(c, d, m);
    cc.bextr(a, m, c);
    cc.bextr(b, m, d);
    cc.bextr(c, m, a);
    cc.blsi(a, m);
    cc.blsi(b, m);
    cc.blsi(c, m);
    cc.blsmsk(a, m);
    cc.blsmsk(b, m);
    cc.blsmsk(c, m);
    cc.blsr(a, m);
    cc.blsr(b, m);
    cc.blsr(c, m);
    cc.bzhi(a, m, c);
    cc.bzhi(b, m, d);
    cc.bzhi(c, m, a);
    cc.lzcnt(a, m);
    cc.lzcnt(b, m);
    cc.lzcnt(c, m);
    cc.pdep(a, b, m);
    cc.pdep(b, c, m);
    cc.pdep(c, d, m);
    cc.pext(a, b, m);
    cc.pext(b, c, m);
    cc.pext(c, d, m);
    cc.popcnt(a, m);
    cc.popcnt(b, m);
    cc.popcnt(c, m);
    cc.rorx(a, m, 8);
    cc.rorx(b, m, 8);
    cc.rorx(c, m, 8);
    cc.sarx(a, m, c);
    cc.sarx(b, m, d);
    cc.sarx(c, m, a);
    cc.shlx(a, m, c);
    cc.shlx(b, m, d);
    cc.shlx(c, m, a);
    cc.shrx(a, m, c);
    cc.shrx(b, m, d);
    cc.shrx(c, m, a);
    cc.tzcnt(a, m);
    cc.tzcnt(b, m);
    cc.tzcnt(c, m);
  }
}

static void generateGpSequence(BaseEmitter& emitter, InstForm form, bool emitPrologEpilog) {
  using namespace asmjit::x86;

  if (emitter.isAssembler()) {
    Assembler& cc = *emitter.as<Assembler>();

    x86::Gp a = cc.zax();
    x86::Gp b = cc.zbx();
    x86::Gp c = cc.zcx();
    x86::Gp d = cc.zdx();

    if (emitPrologEpilog) {
      FuncDetail func;
      func.init(FuncSignature::build<void, void*, const void*, size_t>(), cc.environment());

      FuncFrame frame;
      frame.init(func);
      frame.addDirtyRegs(a, b, c, d);
      frame.finalize();

      cc.emitProlog(frame);
      generateGpSequenceInternal(cc, form, a, b, c, d);
      cc.emitEpilog(frame);
    }
    else {
      generateGpSequenceInternal(cc, form, a, b, c, d);
    }
  }
#ifndef ASMJIT_NO_BUILDER
  else if (emitter.isBuilder()) {
    Builder& cc = *emitter.as<Builder>();

    x86::Gp a = cc.zax();
    x86::Gp b = cc.zbx();
    x86::Gp c = cc.zcx();
    x86::Gp d = cc.zdx();

    if (emitPrologEpilog) {
      FuncDetail func;
      func.init(FuncSignature::build<void, void*, const void*, size_t>(), cc.environment());

      FuncFrame frame;
      frame.init(func);
      frame.addDirtyRegs(a, b, c, d);
      frame.finalize();

      cc.emitProlog(frame);
      generateGpSequenceInternal(cc, form, a, b, c, d);
      cc.emitEpilog(frame);
    }
    else {
      generateGpSequenceInternal(cc, form, a, b, c, d);
    }
  }
#endif
#ifndef ASMJIT_NO_COMPILER
  else if (emitter.isCompiler()) {
    Compiler& cc = *emitter.as<Compiler>();

    Gp a = cc.newIntPtr("a");
    Gp b = cc.newIntPtr("b");
    Gp c = cc.newIntPtr("c");
    Gp d = cc.newIntPtr("d");

    cc.addFunc(FuncSignature::build<void>());
    generateGpSequenceInternal(cc, form, a, b, c, d);
    cc.endFunc();
  }
#endif
}

// Generates a long sequence of SSE instructions (SSE, SSE2, SSE3, SSSE3, SSE4.1, and SSE4.2 in the memory form).
//
// The sequence is fixed: `gp` (its 32-bit view) and all four XMM registers are zeroed first and then one of two
// instruction streams is emitted through `cc`, selected by `form`:
//
//   - `InstForm::kReg` - instructions that only use register (and immediate) operands.
//   - otherwise        - instructions that take a memory operand addressed by `gp`.
//
// Only `xmmA`, `xmmB` and `gp` are used after the initial zeroing - `xmmC` and `xmmD` are zeroed, but not
// referenced by the rest of the sequence.
template<typename Emitter>
static void generateSseSequenceInternal(
  Emitter& cc,
  InstForm form,
  const x86::Gp& gp,
  const x86::Xmm& xmmA, const x86::Xmm& xmmB, const x86::Xmm& xmmC, const x86::Xmm& xmmD) {

  // Views of `gp`: 32-bit, 64-bit, and the one selected by the target (32-bit view if `cc.is32Bit()`).
  x86::Gp gpd = gp.r32();
  x86::Gp gpq = gp.r64();
  x86::Gp gpz = cc.is32Bit() ? gpd : gpq;

  // Zero all registers used by the sequence.
  cc.xor_(gpd, gpd);
  cc.xorps(xmmA, xmmA);
  cc.xorps(xmmB, xmmB);
  cc.xorps(xmmC, xmmC);
  cc.xorps(xmmD, xmmD);

  if (form == InstForm::kReg) {
    // SSE.
    cc.addps(xmmA, xmmB);
    cc.addss(xmmA, xmmB);
    cc.andnps(xmmA, xmmB);
    cc.andps(xmmA, xmmB);
    cc.cmpps(xmmA, xmmB, 0);
    cc.cmpss(xmmA, xmmB, 0);
    cc.comiss(xmmA, xmmB);
    cc.cvtsi2ss(xmmA, gpd);
    cc.cvtsi2ss(xmmA, gpz);
    cc.cvtss2si(gpd, xmmB);
    cc.cvtss2si(gpz, xmmB);
    cc.cvttss2si(gpd, xmmB);
    cc.cvttss2si(gpz, xmmB);
    cc.divps(xmmA, xmmB);
    cc.divss(xmmA, xmmB);
    cc.maxps(xmmA, xmmB);
    cc.maxss(xmmA, xmmB);
    cc.minps(xmmA, xmmB);
    cc.minss(xmmA, xmmB);
    cc.movaps(xmmA, xmmB);
    cc.movd(gpd, xmmB);
    cc.movd(xmmA, gpd);
    cc.movq(xmmA, xmmB);
    cc.movhlps(xmmA, xmmB);
    cc.movlhps(xmmA, xmmB);
    cc.movups(xmmA, xmmB);
    cc.mulps(xmmA, xmmB);
    cc.mulss(xmmA, xmmB);
    cc.orps(xmmA, xmmB);
    cc.rcpps(xmmA, xmmB);
    cc.rcpss(xmmA, xmmB);
    cc.psadbw(xmmA, xmmB);
    cc.rsqrtps(xmmA, xmmB);
    cc.rsqrtss(xmmA, xmmB);
    cc.sfence();
    cc.shufps(xmmA, xmmB, 0);
    cc.sqrtps(xmmA, xmmB);
    cc.sqrtss(xmmA, xmmB);
    cc.subps(xmmA, xmmB);
    cc.subss(xmmA, xmmB);
    cc.ucomiss(xmmA, xmmB);
    cc.unpckhps(xmmA, xmmB);
    cc.unpcklps(xmmA, xmmB);
    cc.xorps(xmmA, xmmB);

    // SSE2.
    cc.addpd(xmmA, xmmB);
    cc.addsd(xmmA, xmmB);
    cc.andnpd(xmmA, xmmB);
    cc.andpd(xmmA, xmmB);
    cc.cmppd(xmmA, xmmB, 0);
    cc.cmpsd(xmmA, xmmB, 0);
    cc.comisd(xmmA, xmmB);
    cc.cvtdq2pd(xmmA, xmmB);
    cc.cvtdq2ps(xmmA, xmmB);
    cc.cvtpd2dq(xmmA, xmmB);
    cc.cvtpd2ps(xmmA, xmmB);
    cc.cvtps2dq(xmmA, xmmB);
    cc.cvtps2pd(xmmA, xmmB);
    cc.cvtsd2si(gpd, xmmB);
    cc.cvtsd2si(gpz, xmmB);
    cc.cvtsd2ss(xmmA, xmmB);
    cc.cvtsi2sd(xmmA, gpd);
    cc.cvtsi2sd(xmmA, gpz);
    cc.cvtss2sd(xmmA, xmmB);
    cc.cvtss2si(gpd, xmmB);
    cc.cvtss2si(gpz, xmmB);
    cc.cvttpd2dq(xmmA, xmmB);
    cc.cvttps2dq(xmmA, xmmB);
    cc.cvttsd2si(gpd, xmmB);
    cc.cvttsd2si(gpz, xmmB);
    cc.divpd(xmmA, xmmB);
    cc.divsd(xmmA, xmmB);
    cc.maxpd(xmmA, xmmB);
    cc.maxsd(xmmA, xmmB);
    cc.minpd(xmmA, xmmB);
    cc.minsd(xmmA, xmmB);
    cc.movdqa(xmmA, xmmB);
    cc.movdqu(xmmA, xmmB);
    cc.movmskps(gpd, xmmB);
    cc.movmskpd(gpd, xmmB);
    cc.movsd(xmmA, xmmB);
    cc.mulpd(xmmA, xmmB);
    cc.mulsd(xmmA, xmmB);
    cc.orpd(xmmA, xmmB);
    cc.packsswb(xmmA, xmmB);
    cc.packssdw(xmmA, xmmB);
    cc.packuswb(xmmA, xmmB);
    cc.paddb(xmmA, xmmB);
    cc.paddw(xmmA, xmmB);
    cc.paddd(xmmA, xmmB);
    cc.paddq(xmmA, xmmB);
    cc.paddsb(xmmA, xmmB);
    cc.paddsw(xmmA, xmmB);
    cc.paddusb(xmmA, xmmB);
    cc.paddusw(xmmA, xmmB);
    cc.pand(xmmA, xmmB);
    cc.pandn(xmmA, xmmB);
    cc.pavgb(xmmA, xmmB);
    cc.pavgw(xmmA, xmmB);
    cc.pcmpeqb(xmmA, xmmB);
    cc.pcmpeqw(xmmA, xmmB);
    cc.pcmpeqd(xmmA, xmmB);
    cc.pcmpgtb(xmmA, xmmB);
    cc.pcmpgtw(xmmA, xmmB);
    cc.pcmpgtd(xmmA, xmmB);
    cc.pmaxsw(xmmA, xmmB);
    cc.pmaxub(xmmA, xmmB);
    cc.pminsw(xmmA, xmmB);
    cc.pminub(xmmA, xmmB);
    cc.pmovmskb(gpd, xmmB);
    cc.pmulhw(xmmA, xmmB);
    cc.pmulhuw(xmmA, xmmB);
    cc.pmullw(xmmA, xmmB);
    cc.pmuludq(xmmA, xmmB);
    cc.por(xmmA, xmmB);
    cc.pslld(xmmA, xmmB);
    cc.pslld(xmmA, 0);
    cc.psllq(xmmA, xmmB);
    cc.psllq(xmmA, 0);
    cc.psllw(xmmA, xmmB);
    cc.psllw(xmmA, 0);
    cc.pslldq(xmmA, 0);
    cc.psrad(xmmA, xmmB);
    cc.psrad(xmmA, 0);
    cc.psraw(xmmA, xmmB);
    cc.psraw(xmmA, 0);
    cc.psubb(xmmA, xmmB);
    cc.psubw(xmmA, xmmB);
    cc.psubd(xmmA, xmmB);
    cc.psubq(xmmA, xmmB);
    cc.pmaddwd(xmmA, xmmB);
    cc.pshufd(xmmA, xmmB, 0);
    cc.pshufhw(xmmA, xmmB, 0);
    cc.pshuflw(xmmA, xmmB, 0);
    cc.psrld(xmmA, xmmB);
    cc.psrld(xmmA, 0);
    cc.psrlq(xmmA, xmmB);
    cc.psrlq(xmmA, 0);
    cc.psrldq(xmmA, 0);
    cc.psrlw(xmmA, xmmB);
    cc.psrlw(xmmA, 0);
    cc.psubsb(xmmA, xmmB);
    cc.psubsw(xmmA, xmmB);
    cc.psubusb(xmmA, xmmB);
    cc.psubusw(xmmA, xmmB);
    cc.punpckhbw(xmmA, xmmB);
    cc.punpckhwd(xmmA, xmmB);
    cc.punpckhdq(xmmA, xmmB);
    cc.punpckhqdq(xmmA, xmmB);
    cc.punpcklbw(xmmA, xmmB);
    cc.punpcklwd(xmmA, xmmB);
    cc.punpckldq(xmmA, xmmB);
    cc.punpcklqdq(xmmA, xmmB);
    cc.pxor(xmmA, xmmB);
    cc.sqrtpd(xmmA, xmmB);
    cc.sqrtsd(xmmA, xmmB);
    cc.subpd(xmmA, xmmB);
    cc.subsd(xmmA, xmmB);
    cc.ucomisd(xmmA, xmmB);
    cc.unpckhpd(xmmA, xmmB);
    cc.unpcklpd(xmmA, xmmB);
    cc.xorpd(xmmA, xmmB);

    // SSE3.
    cc.addsubpd(xmmA, xmmB);
    cc.addsubps(xmmA, xmmB);
    cc.haddpd(xmmA, xmmB);
    cc.haddps(xmmA, xmmB);
    cc.hsubpd(xmmA, xmmB);
    cc.hsubps(xmmA, xmmB);
    cc.movddup(xmmA, xmmB);
    cc.movshdup(xmmA, xmmB);
    cc.movsldup(xmmA, xmmB);

    // SSSE3.
    cc.psignb(xmmA, xmmB);
    cc.psignw(xmmA, xmmB);
    cc.psignd(xmmA, xmmB);
    cc.phaddw(xmmA, xmmB);
    cc.phaddd(xmmA, xmmB);
    cc.phaddsw(xmmA, xmmB);
    cc.phsubw(xmmA, xmmB);
    cc.phsubd(xmmA, xmmB);
    cc.phsubsw(xmmA, xmmB);
    cc.pmaddubsw(xmmA, xmmB);
    cc.pabsb(xmmA, xmmB);
    cc.pabsw(xmmA, xmmB);
    cc.pabsd(xmmA, xmmB);
    cc.pmulhrsw(xmmA, xmmB);
    cc.pshufb(xmmA, xmmB);
    cc.palignr(xmmA, xmmB, 0);

    // SSE4.1.
    cc.blendpd(xmmA, xmmB, 0);
    cc.blendps(xmmA, xmmB, 0);
    cc.blendvpd(xmmA, xmmB, xmmA);
    cc.blendvps(xmmA, xmmB, xmmA);

    cc.dppd(xmmA, xmmB, 0);
    cc.dpps(xmmA, xmmB, 0);
    cc.extractps(gpd, xmmB, 0);
    cc.insertps(xmmA, xmmB, 0);
    cc.mpsadbw(xmmA, xmmB, 0);
    cc.packusdw(xmmA, xmmB);
    cc.pblendvb(xmmA, xmmB, xmmA);
    cc.pblendw(xmmA, xmmB, 0);
    cc.pcmpeqq(xmmA, xmmB);
    cc.pextrb(gpd, xmmB, 0);
    cc.pextrd(gpd, xmmB, 0);
    // The 64-bit extract uses `gpq`, so it's only emitted when targeting 64-bit mode.
    if (cc.is64Bit()) cc.pextrq(gpq, xmmB, 0);
    cc.pextrw(gpd, xmmB, 0);
    cc.phminposuw(xmmA, xmmB);
    cc.pinsrb(xmmA, gpd, 0);
    cc.pinsrd(xmmA, gpd, 0);
    cc.pinsrw(xmmA, gpd, 0);
    cc.pmaxuw(xmmA, xmmB);
    cc.pmaxsb(xmmA, xmmB);
    cc.pmaxsd(xmmA, xmmB);
    cc.pmaxud(xmmA, xmmB);
    cc.pminsb(xmmA, xmmB);
    cc.pminuw(xmmA, xmmB);
    cc.pminud(xmmA, xmmB);
    cc.pminsd(xmmA, xmmB);
    cc.pmovsxbw(xmmA, xmmB);
    cc.pmovsxbd(xmmA, xmmB);
    cc.pmovsxbq(xmmA, xmmB);
    cc.pmovsxwd(xmmA, xmmB);
    cc.pmovsxwq(xmmA, xmmB);
    cc.pmovsxdq(xmmA, xmmB);
    cc.pmovzxbw(xmmA, xmmB);
    cc.pmovzxbd(xmmA, xmmB);
    cc.pmovzxbq(xmmA, xmmB);
    cc.pmovzxwd(xmmA, xmmB);
    cc.pmovzxwq(xmmA, xmmB);
    cc.pmovzxdq(xmmA, xmmB);
    cc.pmuldq(xmmA, xmmB);
    cc.pmulld(xmmA, xmmB);
    cc.ptest(xmmA, xmmB);
    cc.roundps(xmmA, xmmB, 0);
    cc.roundss(xmmA, xmmB, 0);
    cc.roundpd(xmmA, xmmB, 0);
    cc.roundsd(xmmA, xmmB, 0);

    // NOTE(review): the memory form below ends with an SSE4.2 `pcmpgtq`, which has no counterpart in this
    // register form - confirm whether the omission is intentional.
  }
  else {
    // Memory operand used by every instruction of this form - `gpz` is the base register and no size is
    // specified explicitly.
    x86::Mem m = x86::ptr(gpz);

    // SSE.
    cc.addps(xmmA, m);
    cc.addss(xmmA, m);
    cc.andnps(xmmA, m);
    cc.andps(xmmA, m);
    cc.cmpps(xmmA, m, 0);
    cc.cmpss(xmmA, m, 0);
    cc.comiss(xmmA, m);
    cc.cvtpi2ps(xmmA, m);
    cc.cvtsi2ss(xmmA, m);
    cc.cvtss2si(gpd, m);
    cc.cvtss2si(gpz, m);
    cc.cvttss2si(gpd, m);
    cc.cvttss2si(gpz, m);
    cc.divps(xmmA, m);
    cc.divss(xmmA, m);
    cc.maxps(xmmA, m);
    cc.maxss(xmmA, m);
    cc.minps(xmmA, m);
    cc.minss(xmmA, m);
    cc.movaps(xmmA, m);
    cc.movaps(m, xmmB);
    cc.movd(m, xmmB);
    cc.movd(xmmA, m);
    cc.movq(m, xmmB);
    cc.movq(xmmA, m);
    cc.movhps(xmmA, m);
    cc.movhps(m, xmmB);
    cc.movlps(xmmA, m);
    cc.movlps(m, xmmB);
    cc.movntps(m, xmmB);
    cc.movss(xmmA, m);
    cc.movss(m, xmmB);
    cc.movups(xmmA, m);
    cc.movups(m, xmmB);
    cc.mulps(xmmA, m);
    cc.mulss(xmmA, m);
    cc.orps(xmmA, m);
    cc.rcpps(xmmA, m);
    cc.rcpss(xmmA, m);
    cc.psadbw(xmmA, m);
    cc.rsqrtps(xmmA, m);
    cc.rsqrtss(xmmA, m);
    cc.shufps(xmmA, m, 0);
    cc.sqrtps(xmmA, m);
    cc.sqrtss(xmmA, m);
    cc.stmxcsr(m);
    cc.subps(xmmA, m);
    cc.subss(xmmA, m);
    cc.ucomiss(xmmA, m);
    cc.unpckhps(xmmA, m);
    cc.unpcklps(xmmA, m);
    cc.xorps(xmmA, m);

    // SSE2.
    cc.addpd(xmmA, m);
    cc.addsd(xmmA, m);
    cc.andnpd(xmmA, m);
    cc.andpd(xmmA, m);
    cc.cmppd(xmmA, m, 0);
    cc.cmpsd(xmmA, m, 0);
    cc.comisd(xmmA, m);
    cc.cvtdq2pd(xmmA, m);
    cc.cvtdq2ps(xmmA, m);
    cc.cvtpd2dq(xmmA, m);
    cc.cvtpd2ps(xmmA, m);
    cc.cvtpi2pd(xmmA, m);
    cc.cvtps2dq(xmmA, m);
    cc.cvtps2pd(xmmA, m);
    cc.cvtsd2si(gpd, m);
    cc.cvtsd2si(gpz, m);
    cc.cvtsd2ss(xmmA, m);
    cc.cvtsi2sd(xmmA, m);
    cc.cvtss2sd(xmmA, m);
    cc.cvtss2si(gpd, m);
    cc.cvtss2si(gpz, m);
    cc.cvttpd2dq(xmmA, m);
    cc.cvttps2dq(xmmA, m);
    cc.cvttsd2si(gpd, m);
    cc.cvttsd2si(gpz, m);
    cc.divpd(xmmA, m);
    cc.divsd(xmmA, m);
    cc.maxpd(xmmA, m);
    cc.maxsd(xmmA, m);
    cc.minpd(xmmA, m);
    cc.minsd(xmmA, m);
    cc.movdqa(xmmA, m);
    cc.movdqa(m, xmmB);
    cc.movdqu(xmmA, m);
    cc.movdqu(m, xmmB);
    cc.movsd(xmmA, m);
    cc.movsd(m, xmmB);
    cc.movapd(xmmA, m);
    cc.movapd(m, xmmB);
    cc.movhpd(xmmA, m);
    cc.movhpd(m, xmmB);
    cc.movlpd(xmmA, m);
    cc.movlpd(m, xmmB);
    cc.movntdq(m, xmmB);
    cc.movntpd(m, xmmB);
    cc.movupd(xmmA, m);
    cc.movupd(m, xmmB);
    cc.mulpd(xmmA, m);
    cc.mulsd(xmmA, m);
    cc.orpd(xmmA, m);
    cc.packsswb(xmmA, m);
    cc.packssdw(xmmA, m);
    cc.packuswb(xmmA, m);
    cc.paddb(xmmA, m);
    cc.paddw(xmmA, m);
    cc.paddd(xmmA, m);
    cc.paddq(xmmA, m);
    cc.paddsb(xmmA, m);
    cc.paddsw(xmmA, m);
    cc.paddusb(xmmA, m);
    cc.paddusw(xmmA, m);
    cc.pand(xmmA, m);
    cc.pandn(xmmA, m);
    cc.pavgb(xmmA, m);
    cc.pavgw(xmmA, m);
    cc.pcmpeqb(xmmA, m);
    cc.pcmpeqw(xmmA, m);
    cc.pcmpeqd(xmmA, m);
    cc.pcmpgtb(xmmA, m);
    cc.pcmpgtw(xmmA, m);
    cc.pcmpgtd(xmmA, m);
    cc.pmaxsw(xmmA, m);
    cc.pmaxub(xmmA, m);
    cc.pminsw(xmmA, m);
    cc.pminub(xmmA, m);
    cc.pmulhw(xmmA, m);
    cc.pmulhuw(xmmA, m);
    cc.pmullw(xmmA, m);
    cc.pmuludq(xmmA, m);
    cc.por(xmmA, m);
    cc.pslld(xmmA, m);
    cc.psllq(xmmA, m);
    cc.psllw(xmmA, m);
    cc.psrad(xmmA, m);
    cc.psraw(xmmA, m);
    cc.psubb(xmmA, m);
    cc.psubw(xmmA, m);
    cc.psubd(xmmA, m);
    cc.psubq(xmmA, m);
    cc.pmaddwd(xmmA, m);
    cc.pshufd(xmmA, m, 0);
    cc.pshufhw(xmmA, m, 0);
    cc.pshuflw(xmmA, m, 0);
    cc.psrld(xmmA, m);
    cc.psrlq(xmmA, m);
    cc.psrlw(xmmA, m);
    cc.psubsb(xmmA, m);
    cc.psubsw(xmmA, m);
    cc.psubusb(xmmA, m);
    cc.psubusw(xmmA, m);
    cc.punpckhbw(xmmA, m);
    cc.punpckhwd(xmmA, m);
    cc.punpckhdq(xmmA, m);
    cc.punpckhqdq(xmmA, m);
    cc.punpcklbw(xmmA, m);
    cc.punpcklwd(xmmA, m);
    cc.punpckldq(xmmA, m);
    cc.punpcklqdq(xmmA, m);
    cc.pxor(xmmA, m);
    cc.sqrtpd(xmmA, m);
    cc.sqrtsd(xmmA, m);
    cc.subpd(xmmA, m);
    cc.subsd(xmmA, m);
    cc.ucomisd(xmmA, m);
    cc.unpckhpd(xmmA, m);
    cc.unpcklpd(xmmA, m);
    cc.xorpd(xmmA, m);

    // SSE3.
    cc.addsubpd(xmmA, m);
    cc.addsubps(xmmA, m);
    cc.haddpd(xmmA, m);
    cc.haddps(xmmA, m);
    cc.hsubpd(xmmA, m);
    cc.hsubps(xmmA, m);
    cc.lddqu(xmmA, m);
    cc.movddup(xmmA, m);
    cc.movshdup(xmmA, m);
    cc.movsldup(xmmA, m);

    // SSSE3.
    cc.psignb(xmmA, m);
    cc.psignw(xmmA, m);
    cc.psignd(xmmA, m);
    cc.phaddw(xmmA, m);
    cc.phaddd(xmmA, m);
    cc.phaddsw(xmmA, m);
    cc.phsubw(xmmA, m);
    cc.phsubd(xmmA, m);
    cc.phsubsw(xmmA, m);
    cc.pmaddubsw(xmmA, m);
    cc.pabsb(xmmA, m);
    cc.pabsw(xmmA, m);
    cc.pabsd(xmmA, m);
    cc.pmulhrsw(xmmA, m);
    cc.pshufb(xmmA, m);
    cc.palignr(xmmA, m, 0);

    // SSE4.1.
    cc.blendpd(xmmA, m, 0);
    cc.blendps(xmmA, m, 0);
    cc.blendvpd(xmmA, m, xmmA);
    cc.blendvps(xmmA, m, xmmA);

    cc.dppd(xmmA, m, 0);
    cc.dpps(xmmA, m, 0);
    cc.extractps(m, xmmB, 0);
    cc.insertps(xmmA, m, 0);
    cc.movntdqa(xmmA, m);
    cc.mpsadbw(xmmA, m, 0);
    cc.packusdw(xmmA, m);
    cc.pblendvb(xmmA, m, xmmA);
    cc.pblendw(xmmA, m, 0);
    cc.pcmpeqq(xmmA, m);
    cc.pextrb(m, xmmB, 0);
    cc.pextrd(m, xmmB, 0);
    // The 64-bit extract is only emitted when targeting 64-bit mode.
    if (cc.is64Bit()) cc.pextrq(m, xmmB, 0);
    cc.pextrw(m, xmmB, 0);
    cc.phminposuw(xmmA, m);
    cc.pinsrb(xmmA, m, 0);
    cc.pinsrd(xmmA, m, 0);
    cc.pinsrw(xmmA, m, 0);
    cc.pmaxuw(xmmA, m);
    cc.pmaxsb(xmmA, m);
    cc.pmaxsd(xmmA, m);
    cc.pmaxud(xmmA, m);
    cc.pminsb(xmmA, m);
    cc.pminuw(xmmA, m);
    cc.pminud(xmmA, m);
    cc.pminsd(xmmA, m);
    cc.pmovsxbw(xmmA, m);
    cc.pmovsxbd(xmmA, m);
    cc.pmovsxbq(xmmA, m);
    cc.pmovsxwd(xmmA, m);
    cc.pmovsxwq(xmmA, m);
    cc.pmovsxdq(xmmA, m);
    cc.pmovzxbw(xmmA, m);
    cc.pmovzxbd(xmmA, m);
    cc.pmovzxbq(xmmA, m);
    cc.pmovzxwd(xmmA, m);
    cc.pmovzxwq(xmmA, m);
    cc.pmovzxdq(xmmA, m);
    cc.pmuldq(xmmA, m);
    cc.pmulld(xmmA, m);
    cc.ptest(xmmA, m);
    cc.roundps(xmmA, m, 0);
    cc.roundss(xmmA, m, 0);
    cc.roundpd(xmmA, m, 0);
    cc.roundsd(xmmA, m, 0);

    // SSE4.2.
    cc.pcmpgtq(xmmA, m);
  }
}

static void generateSseSequence(BaseEmitter& emitter, InstForm form, bool emitPrologEpilog) {
  using namespace asmjit::x86;

  if (emitter.isAssembler()) {
    Assembler& cc = *emitter.as<Assembler>();

    if (emitPrologEpilog) {
      FuncDetail func;
      func.init(FuncSignature::build<void, void*, const void*, size_t>(), cc.environment());

      FuncFrame frame;
      frame.init(func);
      frame.addDirtyRegs(eax, xmm0, xmm1, xmm2, xmm3);
      frame.finalize();

      cc.emitProlog(frame);
      generateSseSequenceInternal(cc, form, eax, xmm0, xmm1, xmm2, xmm3);
      cc.emitEpilog(frame);
    }
    else {
      generateSseSequenceInternal(cc, form, eax, xmm0, xmm1, xmm2, xmm3);
    }
  }
#ifndef ASMJIT_NO_BUILDER
  else if (emitter.isBuilder()) {
    Builder& cc = *emitter.as<Builder>();

    if (emitPrologEpilog) {
      FuncDetail func;
      func.init(FuncSignature::build<void, void*, const void*, size_t>(), cc.environment());

      FuncFrame frame;
      frame.init(func);
      frame.addDirtyRegs(eax, xmm0, xmm1, xmm2, xmm3);
      frame.finalize();

      cc.emitProlog(frame);
      generateSseSequenceInternal(cc, form, eax, xmm0, xmm1, xmm2, xmm3);
      cc.emitEpilog(frame);
    }
    else {
      generateSseSequenceInternal(cc, form, eax, xmm0, xmm1, xmm2, xmm3);
    }
  }
#endif
#ifndef ASMJIT_NO_COMPILER
  else if (emitter.isCompiler()) {
    Compiler& cc = *emitter.as<Compiler>();

    Gp gp = cc.newGpz("gp");
    Xmm a = cc.newXmm("a");
    Xmm b = cc.newXmm("b");
    Xmm c = cc.newXmm("c");
    Xmm d = cc.newXmm("d");

    cc.addFunc(FuncSignature::build<void>());
    generateSseSequenceInternal(cc, form, gp, a, b, c, d);
    cc.endFunc();
  }
#endif
}

// Generates a long sequence of AVX instructions.
template<typename Emitter>
static void generateAvxSequenceInternalRegOnly(
  Emitter& cc,
  const x86::Gp& gp,
  const x86::Vec& vecA, const x86::Vec& vecB, const x86::Vec& vecC, const x86::Vec& vecD) {

  x86::Gp gpd = gp.r32();
  x86::Gp gpq = gp.r64();
  x86::Gp gpz = cc.is32Bit() ? gpd : gpq;

  x86::Xmm xmmA = vecA.xmm();
  x86::Xmm xmmB = vecB.xmm();
  x86::Xmm xmmC = vecC.xmm();
  x86::Xmm xmmD = vecD.xmm();

  x86::Ymm ymmA = vecA.ymm();
  x86::Ymm ymmB = vecB.ymm();
  x86::Ymm ymmC = vecC.ymm();

  cc.xor_(gpd, gpd);
  cc.vxorps(xmmA, xmmA, xmmA);
  cc.vxorps(xmmB, xmmB, xmmB);
  cc.vxorps(xmmC, xmmC, xmmC);
  cc.vxorps(xmmD, xmmD, xmmD);

  cc.vaddpd(xmmA, xmmB, xmmC);
  cc.vaddpd(ymmA, ymmB, ymmC);
  cc.vaddps(xmmA, xmmB, xmmC);
  cc.vaddps(ymmA, ymmB, ymmC);
  cc.vaddsd(xmmA, xmmB, xmmC);
  cc.vaddss(xmmA, xmmB, xmmC);
  cc.vaddsubpd(xmmA, xmmB, xmmC);
  cc.vaddsubpd(ymmA, ymmB, ymmC);
  cc.vaddsubps(xmmA, xmmB, xmmC);
  cc.vaddsubps(ymmA, ymmB, ymmC);
  cc.vandpd(xmmA, xmmB, xmmC);
  cc.vandpd(ymmA, ymmB, ymmC);
  cc.vandps(xmmA, xmmB, xmmC);
  cc.vandps(ymmA, ymmB, ymmC);
  cc.vandnpd(xmmA, xmmB, xmmC);
  cc.vandnpd(ymmA, ymmB, ymmC);
  cc.vandnps(xmmA, xmmB, xmmC);
  cc.vandnps(ymmA, ymmB, ymmC);
  cc.vblendpd(xmmA, xmmB, xmmC, 0);
  cc.vblendpd(ymmA, ymmB, ymmC, 0);
  cc.vblendps(xmmA, xmmB, xmmC, 0);
  cc.vblendps(ymmA, ymmB, ymmC, 0);
  cc.vblendvpd(xmmA, xmmB, xmmC, xmmA);
  cc.vblendvpd(ymmA, ymmB, ymmC, ymmA);
  cc.vcmppd(xmmA, xmmB, xmmC, 0);
  cc.vcmppd(ymmA, ymmB, ymmC, 0);
  cc.vcmpps(xmmA, xmmB, xmmC, 0);
  cc.vcmpps(ymmA, ymmB, ymmC, 0);
  cc.vcmpsd(xmmA, xmmB, xmmC, 0);
  cc.vcmpss(xmmA, xmmB, xmmC, 0);
  cc.vcomisd(xmmA, xmmB);
  cc.vcomiss(xmmA, xmmB);
  cc.vcvtdq2pd(xmmA, xmmB);
  cc.vcvtdq2pd(ymmA, xmmB);
  cc.vcvtdq2ps(xmmA, xmmB);
  cc.vcvtdq2ps(ymmA, ymmB);
  cc.vcvtpd2dq(xmmA, xmmB);
  cc.vcvtpd2dq(xmmA, ymmB);
  cc.vcvtpd2ps(xmmA, xmmB);
  cc.vcvtpd2ps(xmmA, ymmB);
  cc.vcvtps2dq(xmmA, xmmB);
  cc.vcvtps2dq(ymmA, ymmB);
  cc.vcvtps2pd(xmmA, xmmB);
  cc.vcvtps2pd(ymmA, xmmB);
  cc.vcvtsd2si(gpd, xmmB);
  cc.vcvtsd2si(gpz, xmmB);
  cc.vcvtsd2ss(xmmA, xmmB, xmmC);
  cc.vcvtsi2sd(xmmA, xmmB, gpd);
  cc.vcvtsi2sd(xmmA, xmmB, gpz);
  cc.vcvtsi2ss(xmmA, xmmB, gpd);
  cc.vcvtsi2ss(xmmA, xmmB, gpz);
  cc.vcvtss2sd(xmmA, xmmB, xmmC);
  cc.vcvtss2si(gpd, xmmB);
  cc.vcvttpd2dq(xmmA, xmmB);
  cc.vcvttpd2dq(xmmA, ymmB);
  cc.vcvttps2dq(xmmA, xmmB);
  cc.vcvttps2dq(ymmA, ymmB);
  cc.vcvttsd2si(gpd, xmmB);
  cc.vcvttss2si(gpz, xmmB);
  cc.vdivpd(xmmA, xmmB, xmmC);
  cc.vdivpd(ymmA, ymmB, ymmC);
  cc.vdivps(xmmA, xmmB, xmmC);
  cc.vdivps(ymmA, ymmB, ymmC);
  cc.vdivsd(xmmA, xmmB, xmmC);
  cc.vdivss(xmmA, xmmB, xmmC);
  cc.vdppd(xmmA, xmmB, xmmC, 0);
  cc.vdpps(xmmA, xmmB, xmmC, 0);
  cc.vdpps(ymmA, ymmB, ymmC, 0);
  cc.vextractf128(xmmA, ymmB, 0);
  cc.vextractps(gpd, xmmB, 0);
  cc.vhaddpd(xmmA, xmmB, xmmC);
  cc.vhaddpd(ymmA, ymmB, ymmC);
  cc.vhaddps(xmmA, xmmB, xmmC);
  cc.vhaddps(ymmA, ymmB, ymmC);
  cc.vhsubpd(xmmA, xmmB, xmmC);
  cc.vhsubpd(ymmA, ymmB, ymmC);
  cc.vhsubps(xmmA, xmmB, xmmC);
  cc.vhsubps(ymmA, ymmB, ymmC);
  cc.vinsertf128(ymmA, ymmB, xmmC, 0);
  cc.vinsertps(xmmA, xmmB, xmmC, 0);
  cc.vmaxpd(xmmA, xmmB, xmmC);
  cc.vmaxpd(ymmA, ymmB, ymmC);
  cc.vmaxps(xmmA, xmmB, xmmC);
  cc.vmaxps(ymmA, ymmB, ymmC);
  cc.vmaxsd(xmmA, xmmB, xmmC);
  cc.vmaxss(xmmA, xmmB, xmmC);
  cc.vminpd(xmmA, xmmB, xmmC);
  cc.vminpd(ymmA, ymmB, ymmC);
  cc.vminps(xmmA, xmmB, xmmC);
  cc.vminps(ymmA, ymmB, ymmC);
  cc.vminsd(xmmA, xmmB, xmmC);
  cc.vminss(xmmA, xmmB, xmmC);
  cc.vmovapd(xmmA, xmmB);
  cc.vmovapd(ymmA, ymmB);
  cc.vmovaps(xmmA, xmmB);
  cc.vmovaps(ymmA, ymmB);
  cc.vmovd(xmmA, gpd);
  cc.vmovd(gpd, xmmB);
  cc.vmovddup(xmmA, xmmB);
  cc.vmovddup(ymmA, ymmB);
  cc.vmovdqa(xmmA, xmmB);
  cc.vmovdqa(ymmA, ymmB);
  cc.vmovdqu(xmmA, xmmB);
  cc.vmovdqu(ymmA, ymmB);
  cc.vmovhlps(xmmA, xmmB, xmmC);
  cc.vmovlhps(xmmA, xmmB, xmmC);
  cc.vmovmskpd(gpd, xmmB);
  cc.vmovmskpd(gpd, ymmB);
  cc.vmovmskps(gpd, xmmB);
  cc.vmovmskps(gpd, ymmB);
  cc.vmovsd(xmmA, xmmB, xmmC);
  cc.vmovshdup(xmmA, xmmB);
  cc.vmovshdup(ymmA, ymmB);
  cc.vmovsldup(xmmA, xmmB);
  cc.vmovsldup(ymmA, ymmB);
  cc.vmovss(xmmA, xmmB, xmmC);
  cc.vmovupd(xmmA, xmmB);
  cc.vmovupd(ymmA, ymmB);
  cc.vmovups(xmmA, xmmB);
  cc.vmovups(ymmA, ymmB);
  cc.vmpsadbw(xmmA, xmmB, xmmC, 0);
  cc.vmulpd(xmmA, xmmB, xmmC);
  cc.vmulpd(ymmA, ymmB, ymmC);
  cc.vmulps(xmmA, xmmB, xmmC);
  cc.vmulps(ymmA, ymmB, ymmC);
  cc.vmulsd(xmmA, xmmB, xmmC);
  cc.vmulss(xmmA, xmmB, xmmC);
  cc.vorpd(xmmA, xmmB, xmmC);
  cc.vorpd(ymmA, ymmB, ymmC);
  cc.vorps(xmmA, xmmB, xmmC);
  cc.vorps(ymmA, ymmB, ymmC);
  cc.vpabsb(xmmA, xmmB);
  cc.vpabsd(xmmA, xmmB);
  cc.vpabsw(xmmA, xmmB);
  cc.vpackssdw(xmmA, xmmB, xmmC);
  cc.vpacksswb(xmmA, xmmB, xmmC);
  cc.vpackusdw(xmmA, xmmB, xmmC);
  cc.vpackuswb(xmmA, xmmB, xmmC);
  cc.vpaddb(xmmA, xmmB, xmmC);
  cc.vpaddd(xmmA, xmmB, xmmC);
  cc.vpaddq(xmmA, xmmB, xmmC);
  cc.vpaddw(xmmA, xmmB, xmmC);
  cc.vpaddsb(xmmA, xmmB, xmmC);
  cc.vpaddsw(xmmA, xmmB, xmmC);
  cc.vpaddusb(xmmA, xmmB, xmmC);
  cc.vpaddusw(xmmA, xmmB, xmmC);
  cc.vpalignr(xmmA, xmmB, xmmC, 0);
  cc.vpand(xmmA, xmmB, xmmC);
  cc.vpandn(xmmA, xmmB, xmmC);
  cc.vpavgb(xmmA, xmmB, xmmC);
  cc.vpavgw(xmmA, xmmB, xmmC);
  cc.vpblendvb(xmmA, xmmB, xmmC, xmmA);
  cc.vpblendw(xmmA, xmmB, xmmC, 0);
  cc.vpcmpeqb(xmmA, xmmB, xmmC);
  cc.vpcmpeqd(xmmA, xmmB, xmmC);
  cc.vpcmpeqq(xmmA, xmmB, xmmC);
  cc.vpcmpeqw(xmmA, xmmB, xmmC);
  cc.vpcmpgtb(xmmA, xmmB, xmmC);
  cc.vpcmpgtd(xmmA, xmmB, xmmC);
  cc.vpcmpgtq(xmmA, xmmB, xmmC);
  cc.vpcmpgtw(xmmA, xmmB, xmmC);
  cc.vpermilpd(xmmA, xmmB, xmmC);
  cc.vpermilpd(ymmA, ymmB, ymmC);
  cc.vpermilpd(xmmA, xmmB, 0);
  cc.vpermilpd(ymmA, ymmB, 0);
  cc.vpermilps(xmmA, xmmB, xmmC);
  cc.vpermilps(ymmA, ymmB, ymmC);
  cc.vpermilps(xmmA, xmmB, 0);
  cc.vpermilps(ymmA, ymmB, 0);
  cc.vperm2f128(ymmA, ymmB, ymmC, 0);
  cc.vpextrb(gpd, xmmB, 0);
  cc.vpextrd(gpd, xmmB, 0);
  if (cc.is64Bit()) cc.vpextrq(gpq, xmmB, 0);
  cc.vpextrw(gpd, xmmB, 0);
  cc.vphaddd(xmmA, xmmB, xmmC);
  cc.vphaddsw(xmmA, xmmB, xmmC);
  cc.vphaddw(xmmA, xmmB, xmmC);
  cc.vphminposuw(xmmA, xmmB);
  cc.vphsubd(xmmA, xmmB, xmmC);
  cc.vphsubsw(xmmA, xmmB, xmmC);
  cc.vphsubw(xmmA, xmmB, xmmC);
  cc.vpinsrb(xmmA, xmmB, gpd, 0);
  cc.vpinsrd(xmmA, xmmB, gpd, 0);
  cc.vpinsrw(xmmA, xmmB, gpd, 0);
  cc.vpmaddubsw(xmmA, xmmB, xmmC);
  cc.vpmaddwd(xmmA, xmmB, xmmC);
  cc.vpmaxsb(xmmA, xmmB, xmmC);
  cc.vpmaxsd(xmmA, xmmB, xmmC);
  cc.vpmaxsw(xmmA, xmmB, xmmC);
  cc.vpmaxub(xmmA, xmmB, xmmC);
  cc.vpmaxud(xmmA, xmmB, xmmC);
  cc.vpmaxuw(xmmA, xmmB, xmmC);
  cc.vpminsb(xmmA, xmmB, xmmC);
  cc.vpminsd(xmmA, xmmB, xmmC);
  cc.vpminsw(xmmA, xmmB, xmmC);
  cc.vpminub(xmmA, xmmB, xmmC);
  cc.vpminud(xmmA, xmmB, xmmC);
  cc.vpminuw(xmmA, xmmB, xmmC);
  cc.vpmovmskb(gpd, xmmB);
  cc.vpmovsxbd(xmmA, xmmB);
  cc.vpmovsxbq(xmmA, xmmB);
  cc.vpmovsxbw(xmmA, xmmB);
  cc.vpmovsxdq(xmmA, xmmB);
  cc.vpmovsxwd(xmmA, xmmB);
  cc.vpmovsxwq(xmmA, xmmB);
  cc.vpmovzxbd(xmmA, xmmB);
  cc.vpmovzxbq(xmmA, xmmB);
  cc.vpmovzxbw(xmmA, xmmB);
  cc.vpmovzxdq(xmmA, xmmB);
  cc.vpmovzxwd(xmmA, xmmB);
  cc.vpmovzxwq(xmmA, xmmB);
  cc.vpmuldq(xmmA, xmmB, xmmC);
  cc.vpmulhrsw(xmmA, xmmB, xmmC);
  cc.vpmulhuw(xmmA, xmmB, xmmC);
  cc.vpmulhw(xmmA, xmmB, xmmC);
  cc.vpmulld(xmmA, xmmB, xmmC);
  cc.vpmullw(xmmA, xmmB, xmmC);
  cc.vpmuludq(xmmA, xmmB, xmmC);
  cc.vpor(xmmA, xmmB, xmmC);
  cc.vpsadbw(xmmA, xmmB, xmmC);
  cc.vpshufb(xmmA, xmmB, xmmC);
  cc.vpshufd(xmmA, xmmB, 0);
  cc.vpshufhw(xmmA, xmmB, 0);
  cc.vpshuflw(xmmA, xmmB, 0);
  cc.vpsignb(xmmA, xmmB, xmmC);
  cc.vpsignd(xmmA, xmmB, xmmC);
  cc.vpsignw(xmmA, xmmB, xmmC);
  cc.vpslld(xmmA, xmmB, xmmC);
  cc.vpslld(xmmA, xmmB, 0);
  cc.vpslldq(xmmA, xmmB, 0);
  cc.vpsllq(xmmA, xmmB, xmmC);
  cc.vpsllq(xmmA, xmmB, 0);
  cc.vpsllw(xmmA, xmmB, xmmC);
  cc.vpsllw(xmmA, xmmB, 0);
  cc.vpsrad(xmmA, xmmB, xmmC);
  cc.vpsrad(xmmA, xmmB, 0);
  cc.vpsraw(xmmA, xmmB, xmmC);
  cc.vpsraw(xmmA, xmmB, 0);
  cc.vpsrld(xmmA, xmmB, xmmC);
  cc.vpsrld(xmmA, xmmB, 0);
  cc.vpsrldq(xmmA, xmmB, 0);
  cc.vpsrlq(xmmA, xmmB, xmmC);
  cc.vpsrlq(xmmA, xmmB, 0);
  cc.vpsrlw(xmmA, xmmB, xmmC);
  cc.vpsrlw(xmmA, xmmB, 0);
  cc.vpsubb(xmmA, xmmB, xmmC);
  cc.vpsubd(xmmA, xmmB, xmmC);
  cc.vpsubq(xmmA, xmmB, xmmC);
  cc.vpsubw(xmmA, xmmB, xmmC);
  cc.vpsubsb(xmmA, xmmB, xmmC);
  cc.vpsubsw(xmmA, xmmB, xmmC);
  cc.vpsubusb(xmmA, xmmB, xmmC);
  cc.vpsubusw(xmmA, xmmB, xmmC);
  cc.vptest(xmmA, xmmB);
  cc.vptest(ymmA, ymmB);
  cc.vpunpckhbw(xmmA, xmmB, xmmC);
  cc.vpunpckhdq(xmmA, xmmB, xmmC);
  cc.vpunpckhqdq(xmmA, xmmB, xmmC);
  cc.vpunpckhwd(xmmA, xmmB, xmmC);
  cc.vpunpcklbw(xmmA, xmmB, xmmC);
  cc.vpunpckldq(xmmA, xmmB, xmmC);
  cc.vpunpcklqdq(xmmA, xmmB, xmmC);
  cc.vpunpcklwd(xmmA, xmmB, xmmC);
  cc.vpxor(xmmA, xmmB, xmmC);
  cc.vrcpps(xmmA, xmmB);
  cc.vrcpps(ymmA, ymmB);
  cc.vrcpss(xmmA, xmmB, xmmC);
  cc.vrsqrtps(xmmA, xmmB);
  cc.vrsqrtps(ymmA, ymmB);
  cc.vrsqrtss(xmmA, xmmB, xmmC);
  cc.vroundpd(xmmA, xmmB, 0);
  cc.vroundpd(ymmA, ymmB, 0);
  cc.vroundps(xmmA, xmmB, 0);
  cc.vroundps(ymmA, ymmB, 0);
  cc.vroundsd(xmmA, xmmB, xmmC, 0);
  cc.vroundss(xmmA, xmmB, xmmC, 0);
  cc.vshufpd(xmmA, xmmB, xmmC, 0);
  cc.vshufpd(ymmA, ymmB, ymmC, 0);
  cc.vshufps(xmmA, xmmB, xmmC, 0);
  cc.vshufps(ymmA, ymmB, ymmC, 0);
  cc.vsqrtpd(xmmA, xmmB);
  cc.vsqrtpd(ymmA, ymmB);
  cc.vsqrtps(xmmA, xmmB);
  cc.vsqrtps(ymmA, ymmB);
  cc.vsqrtsd(xmmA, xmmB, xmmC);
  cc.vsqrtss(xmmA, xmmB, xmmC);
  cc.vsubpd(xmmA, xmmB, xmmC);
  cc.vsubpd(ymmA, ymmB, ymmC);
  cc.vsubps(xmmA, xmmB, xmmC);
  cc.vsubps(ymmA, ymmB, ymmC);
  cc.vsubsd(xmmA, xmmB, xmmC);
  cc.vsubss(xmmA, xmmB, xmmC);
  cc.vtestps(xmmA, xmmB);
  cc.vtestps(ymmA, ymmB);
  cc.vtestpd(xmmA, xmmB);
  cc.vtestpd(ymmA, ymmB);
  cc.vucomisd(xmmA, xmmB);
  cc.vucomiss(xmmA, xmmB);
  cc.vunpckhpd(xmmA, xmmB, xmmC);
  cc.vunpckhpd(ymmA, ymmB, ymmC);
  cc.vunpckhps(xmmA, xmmB, xmmC);
  cc.vunpckhps(ymmA, ymmB, ymmC);
  cc.vunpcklpd(xmmA, xmmB, xmmC);
  cc.vunpcklpd(ymmA, ymmB, ymmC);
  cc.vunpcklps(xmmA, xmmB, xmmC);
  cc.vunpcklps(ymmA, ymmB, ymmC);
  cc.vxorpd(xmmA, xmmB, xmmC);
  cc.vxorpd(ymmA, ymmB, ymmC);
  cc.vxorps(xmmA, xmmB, xmmC);
  cc.vxorps(ymmA, ymmB, ymmC);

  // AVX+AESNI.
  cc.vaesdec(xmmA, xmmB, xmmC);
  cc.vaesdeclast(xmmA, xmmB, xmmC);
  cc.vaesenc(xmmA, xmmB, xmmC);
  cc.vaesenclast(xmmA, xmmB, xmmC);
  cc.vaesimc(xmmA, xmmB);
  cc.vaeskeygenassist(xmmA, xmmB, 0);

  // AVX+PCLMULQDQ.
  cc.vpclmulqdq(xmmA, xmmB, xmmC, 0);

  // AVX2.
  cc.vbroadcastsd(ymmA, xmmB);
  cc.vbroadcastss(xmmA, xmmB);
  cc.vbroadcastss(ymmA, xmmB);
  cc.vextracti128(xmmA, ymmB, 0);
  cc.vinserti128(ymmA, ymmB, xmmC, 0);
  cc.vmpsadbw(ymmA, ymmB, ymmC, 0);
  cc.vpabsb(ymmA, ymmB);
  cc.vpabsd(ymmA, ymmB);
  cc.vpabsw(ymmA, ymmB);
  cc.vpackssdw(ymmA, ymmB, ymmC);
  cc.vpacksswb(ymmA, ymmB, ymmC);
  cc.vpackusdw(ymmA, ymmB, ymmC);
  cc.vpackuswb(ymmA, ymmB, ymmC);
  cc.vpaddb(ymmA, ymmB, ymmC);
  cc.vpaddd(ymmA, ymmB, ymmC);
  cc.vpaddq(ymmA, ymmB, ymmC);
  cc.vpaddw(ymmA, ymmB, ymmC);
  cc.vpaddsb(ymmA, ymmB, ymmC);
  cc.vpaddsw(ymmA, ymmB, ymmC);
  cc.vpaddusb(ymmA, ymmB, ymmC);
  cc.vpaddusw(ymmA, ymmB, ymmC);
  cc.vpalignr(ymmA, ymmB, ymmC, 0);
  cc.vpand(ymmA, ymmB, ymmC);
  cc.vpandn(ymmA, ymmB, ymmC);
  cc.vpavgb(ymmA, ymmB, ymmC);
  cc.vpavgw(ymmA, ymmB, ymmC);
  cc.vpblendd(xmmA, xmmB, xmmC, 0);
  cc.vpblendd(ymmA, ymmB, ymmC, 0);
  cc.vpblendvb(ymmA, ymmB, ymmC, ymmA);
  cc.vpblendw(ymmA, ymmB, ymmC, 0);
  cc.vpbroadcastb(xmmA, xmmB);
  cc.vpbroadcastb(ymmA, xmmB);
  cc.vpbroadcastd(xmmA, xmmB);
  cc.vpbroadcastd(ymmA, xmmB);
  cc.vpbroadcastq(xmmA, xmmB);
  cc.vpbroadcastq(ymmA, xmmB);
  cc.vpbroadcastw(xmmA, xmmB);
  cc.vpbroadcastw(ymmA, xmmB);
  cc.vpcmpeqb(ymmA, ymmB, ymmC);
  cc.vpcmpeqd(ymmA, ymmB, ymmC);
  cc.vpcmpeqq(ymmA, ymmB, ymmC);
  cc.vpcmpeqw(ymmA, ymmB, ymmC);
  cc.vpcmpgtb(ymmA, ymmB, ymmC);
  cc.vpcmpgtd(ymmA, ymmB, ymmC);
  cc.vpcmpgtq(ymmA, ymmB, ymmC);
  cc.vpcmpgtw(ymmA, ymmB, ymmC);
  cc.vperm2i128(ymmA, ymmB, ymmC, 0);
  cc.vpermd(ymmA, ymmB, ymmC);
  cc.vpermps(ymmA, ymmB, ymmC);
  cc.vpermpd(ymmA, ymmB, 0);
  cc.vpermq(ymmA, ymmB, 0);
  cc.vpmovmskb(gpd, ymmB);
  cc.vpmovsxbd(ymmA, xmmB);
  cc.vpmovsxbq(ymmA, xmmB);
  cc.vpmovsxbw(ymmA, xmmB);
  cc.vpmovsxdq(ymmA, xmmB);
  cc.vpmovsxwd(ymmA, xmmB);
  cc.vpmovsxwq(ymmA, xmmB);
  cc.vpmovzxbd(ymmA, xmmB);
  cc.vpmovzxbq(ymmA, xmmB);
  cc.vpmovzxbw(ymmA, xmmB);
  cc.vpmovzxdq(ymmA, xmmB);
  cc.vpmovzxwd(ymmA, xmmB);
  cc.vpmovzxwq(ymmA, xmmB);
  cc.vpshufd(ymmA, ymmB, 0);
  cc.vpshufhw(ymmA, ymmB, 0);
  cc.vpshuflw(ymmA, ymmB, 0);
  cc.vpslld(ymmA, ymmB, 0);
  cc.vpslldq(ymmA, ymmB, 0);
  cc.vpsllq(ymmA, ymmB, 0);
  cc.vpsllw(ymmA, ymmB, 0);
  cc.vpsrad(ymmA, ymmB, 0);
  cc.vpsraw(ymmA, ymmB, 0);
  cc.vpsrld(ymmA, ymmB, 0);
  cc.vpsrldq(ymmA, ymmB, 0);
  cc.vpsrlq(ymmA, ymmB, 0);
  cc.vpsrlw(ymmA, ymmB, 0);
  cc.vphaddd(ymmA, ymmB, ymmC);
  cc.vphaddsw(ymmA, ymmB, ymmC);
  cc.vphaddw(ymmA, ymmB, ymmC);
  cc.vphsubd(ymmA, ymmB, ymmC);
  cc.vphsubsw(ymmA, ymmB, ymmC);
  cc.vphsubw(ymmA, ymmB, ymmC);
  cc.vpmaddubsw(ymmA, ymmB, ymmC);
  cc.vpmaddwd(ymmA, ymmB, ymmC);
  cc.vpmaxsb(ymmA, ymmB, ymmC);
  cc.vpmaxsd(ymmA, ymmB, ymmC);
  cc.vpmaxsw(ymmA, ymmB, ymmC);
  cc.vpmaxub(ymmA, ymmB, ymmC);
  cc.vpmaxud(ymmA, ymmB, ymmC);
  cc.vpmaxuw(ymmA, ymmB, ymmC);
  cc.vpminsb(ymmA, ymmB, ymmC);
  cc.vpminsd(ymmA, ymmB, ymmC);
  cc.vpminsw(ymmA, ymmB, ymmC);
  cc.vpminub(ymmA, ymmB, ymmC);
  cc.vpminud(ymmA, ymmB, ymmC);
  cc.vpminuw(ymmA, ymmB, ymmC);
  cc.vpmuldq(ymmA, ymmB, ymmC);
  cc.vpmulhrsw(ymmA, ymmB, ymmC);
  cc.vpmulhuw(ymmA, ymmB, ymmC);
  cc.vpmulhw(ymmA, ymmB, ymmC);
  cc.vpmulld(ymmA, ymmB, ymmC);
  cc.vpmullw(ymmA, ymmB, ymmC);
  cc.vpmuludq(ymmA, ymmB, ymmC);
  cc.vpor(ymmA, ymmB, ymmC);
  cc.vpsadbw(ymmA, ymmB, ymmC);
  cc.vpshufb(ymmA, ymmB, ymmC);
  cc.vpsignb(ymmA, ymmB, ymmC);
  cc.vpsignd(ymmA, ymmB, ymmC);
  cc.vpsignw(ymmA, ymmB, ymmC);
  cc.vpslld(ymmA, ymmB, xmmC);
  cc.vpsllq(ymmA, ymmB, xmmC);
  cc.vpsllvd(xmmA, xmmB, xmmC);
  cc.vpsllvd(ymmA, ymmB, ymmC);
  cc.vpsllvq(xmmA, xmmB, xmmC);
  cc.vpsllvq(ymmA, ymmB, ymmC);
  cc.vpsllw(ymmA, ymmB, xmmC);
  cc.vpsrad(ymmA, ymmB, xmmC);
  cc.vpsravd(xmmA, xmmB, xmmC);
  cc.vpsravd(ymmA, ymmB, ymmC);
  cc.vpsraw(ymmA, ymmB, xmmC);
  cc.vpsrld(ymmA, ymmB, xmmC);
  cc.vpsrlq(ymmA, ymmB, xmmC);
  cc.vpsrlvd(xmmA, xmmB, xmmC);
  cc.vpsrlvd(ymmA, ymmB, ymmC);
  cc.vpsrlvq(xmmA, xmmB, xmmC);
  cc.vpsrlvq(ymmA, ymmB, ymmC);
  cc.vpsrlw(ymmA, ymmB, xmmC);
  cc.vpsubb(ymmA, ymmB, ymmC);
  cc.vpsubd(ymmA, ymmB, ymmC);
  cc.vpsubq(ymmA, ymmB, ymmC);
  cc.vpsubsb(ymmA, ymmB, ymmC);
  cc.vpsubsw(ymmA, ymmB, ymmC);
  cc.vpsubusb(ymmA, ymmB, ymmC);
  cc.vpsubusw(ymmA, ymmB, ymmC);
  cc.vpsubw(ymmA, ymmB, ymmC);
  cc.vpunpckhbw(ymmA, ymmB, ymmC);
  cc.vpunpckhdq(ymmA, ymmB, ymmC);
  cc.vpunpckhqdq(ymmA, ymmB, ymmC);
  cc.vpunpckhwd(ymmA, ymmB, ymmC);
  cc.vpunpcklbw(ymmA, ymmB, ymmC);
  cc.vpunpckldq(ymmA, ymmB, ymmC);
  cc.vpunpcklqdq(ymmA, ymmB, ymmC);
  cc.vpunpcklwd(ymmA, ymmB, ymmC);
  cc.vpxor(ymmA, ymmB, ymmC);

  // FMA.
  cc.vfmadd132pd(xmmA, xmmB, xmmC);
  cc.vfmadd132pd(ymmA, ymmB, ymmC);
  cc.vfmadd132ps(xmmA, xmmB, xmmC);
  cc.vfmadd132ps(ymmA, ymmB, ymmC);
  cc.vfmadd132sd(xmmA, xmmB, xmmC);
  cc.vfmadd132ss(xmmA, xmmB, xmmC);
  cc.vfmadd213pd(xmmA, xmmB, xmmC);
  cc.vfmadd213pd(ymmA, ymmB, ymmC);
  cc.vfmadd213ps(xmmA, xmmB, xmmC);
  cc.vfmadd213ps(ymmA, ymmB, ymmC);
  cc.vfmadd213sd(xmmA, xmmB, xmmC);
  cc.vfmadd213ss(xmmA, xmmB, xmmC);
  cc.vfmadd231pd(xmmA, xmmB, xmmC);
  cc.vfmadd231pd(ymmA, ymmB, ymmC);
  cc.vfmadd231ps(xmmA, xmmB, xmmC);
  cc.vfmadd231ps(ymmA, ymmB, ymmC);
  cc.vfmadd231sd(xmmA, xmmB, xmmC);
  cc.vfmadd231ss(xmmA, xmmB, xmmC);
  cc.vfmaddsub132pd(xmmA, xmmB, xmmC);
  cc.vfmaddsub132pd(ymmA, ymmB, ymmC);
  cc.vfmaddsub132ps(xmmA, xmmB, xmmC);
  cc.vfmaddsub132ps(ymmA, ymmB, ymmC);
  cc.vfmaddsub213pd(xmmA, xmmB, xmmC);
  cc.vfmaddsub213pd(ymmA, ymmB, ymmC);
  cc.vfmaddsub213ps(xmmA, xmmB, xmmC);
  cc.vfmaddsub213ps(ymmA, ymmB, ymmC);
  cc.vfmaddsub231pd(xmmA, xmmB, xmmC);
  cc.vfmaddsub231pd(ymmA, ymmB, ymmC);
  cc.vfmaddsub231ps(xmmA, xmmB, xmmC);
  cc.vfmaddsub231ps(ymmA, ymmB, ymmC);
  cc.vfmsub132pd(xmmA, xmmB, xmmC);
  cc.vfmsub132pd(ymmA, ymmB, ymmC);
  cc.vfmsub132ps(xmmA, xmmB, xmmC);
  cc.vfmsub132ps(ymmA, ymmB, ymmC);
  cc.vfmsub132sd(xmmA, xmmB, xmmC);
  cc.vfmsub132ss(xmmA, xmmB, xmmC);
  cc.vfmsub213pd(xmmA, xmmB, xmmC);
  cc.vfmsub213pd(ymmA, ymmB, ymmC);
  cc.vfmsub213ps(xmmA, xmmB, xmmC);
  cc.vfmsub213ps(ymmA, ymmB, ymmC);
  cc.vfmsub213sd(xmmA, xmmB, xmmC);
  cc.vfmsub213ss(xmmA, xmmB, xmmC);
  cc.vfmsub231pd(xmmA, xmmB, xmmC);
  cc.vfmsub231pd(ymmA, ymmB, ymmC);
  cc.vfmsub231ps(xmmA, xmmB, xmmC);
  cc.vfmsub231ps(ymmA, ymmB, ymmC);
  cc.vfmsub231sd(xmmA, xmmB, xmmC);
  cc.vfmsub231ss(xmmA, xmmB, xmmC);
  cc.vfmsubadd132pd(xmmA, xmmB, xmmC);
  cc.vfmsubadd132pd(ymmA, ymmB, ymmC);
  cc.vfmsubadd132ps(xmmA, xmmB, xmmC);
  cc.vfmsubadd132ps(ymmA, ymmB, ymmC);
  cc.vfmsubadd213pd(xmmA, xmmB, xmmC);
  cc.vfmsubadd213pd(ymmA, ymmB, ymmC);
  cc.vfmsubadd213ps(xmmA, xmmB, xmmC);
  cc.vfmsubadd213ps(ymmA, ymmB, ymmC);
  cc.vfmsubadd231pd(xmmA, xmmB, xmmC);
  cc.vfmsubadd231pd(ymmA, ymmB, ymmC);
  cc.vfmsubadd231ps(xmmA, xmmB, xmmC);
  cc.vfmsubadd231ps(ymmA, ymmB, ymmC);
  cc.vfnmadd132pd(xmmA, xmmB, xmmC);
  cc.vfnmadd132pd(ymmA, ymmB, ymmC);
  cc.vfnmadd132ps(xmmA, xmmB, xmmC);
  cc.vfnmadd132ps(ymmA, ymmB, ymmC);
  cc.vfnmadd132sd(xmmA, xmmB, xmmC);
  cc.vfnmadd132ss(xmmA, xmmB, xmmC);
  cc.vfnmadd213pd(xmmA, xmmB, xmmC);
  cc.vfnmadd213pd(ymmA, ymmB, ymmC);
  cc.vfnmadd213ps(xmmA, xmmB, xmmC);
  cc.vfnmadd213ps(ymmA, ymmB, ymmC);
  cc.vfnmadd213sd(xmmA, xmmB, xmmC);
  cc.vfnmadd213ss(xmmA, xmmB, xmmC);
  cc.vfnmadd231pd(xmmA, xmmB, xmmC);
  cc.vfnmadd231pd(ymmA, ymmB, ymmC);
  cc.vfnmadd231ps(xmmA, xmmB, xmmC);
  cc.vfnmadd231ps(ymmA, ymmB, ymmC);
  cc.vfnmadd231sd(xmmA, xmmB, xmmC);
  cc.vfnmadd231ss(xmmA, xmmB, xmmC);
  cc.vfnmsub132pd(xmmA, xmmB, xmmC);
  cc.vfnmsub132pd(ymmA, ymmB, ymmC);
  cc.vfnmsub132ps(xmmA, xmmB, xmmC);
  cc.vfnmsub132ps(ymmA, ymmB, ymmC);
  cc.vfnmsub132sd(xmmA, xmmB, xmmC);
  cc.vfnmsub132ss(xmmA, xmmB, xmmC);
  cc.vfnmsub213pd(xmmA, xmmB, xmmC);
  cc.vfnmsub213pd(ymmA, ymmB, ymmC);
  cc.vfnmsub213ps(xmmA, xmmB, xmmC);
  cc.vfnmsub213ps(ymmA, ymmB, ymmC);
  cc.vfnmsub213sd(xmmA, xmmB, xmmC);
  cc.vfnmsub213ss(xmmA, xmmB, xmmC);
  cc.vfnmsub231pd(xmmA, xmmB, xmmC);
  cc.vfnmsub231pd(ymmA, ymmB, ymmC);
  cc.vfnmsub231ps(xmmA, xmmB, xmmC);
  cc.vfnmsub231ps(ymmA, ymmB, ymmC);
  cc.vfnmsub231sd(xmmA, xmmB, xmmC);
  cc.vfnmsub231ss(xmmA, xmmB, xmmC);
}

// Generates a long sequence of AVX, AVX+AESNI, AVX+PCLMULQDQ, and AVX2 instructions that use
// a memory operand.
//
// `gp` provides the base address register of every memory operand (its native-width view is
// used). `vecA`, `vecB`, `vecC`, and `vecD` provide the XMM/YMM registers; after being zeroed,
// `vecD` is only used as the vector index of gather instructions.
//
// The general purpose register and all four vector registers are zeroed first, then every
// emitted instruction takes one of the memory operands declared below.
template<typename Emitter>
static void generateAvxSequenceInternalRegMem(
  Emitter& cc,
  const x86::Gp& gp,
  const x86::Vec& vecA, const x86::Vec& vecB, const x86::Vec& vecC, const x86::Vec& vecD) {

  x86::Gp gpd = gp.r32();
  x86::Gp gpq = gp.r64();
  // Native-width view of `gp`, used as the base register of all memory operands.
  x86::Gp gpz = cc.is32Bit() ? gpd : gpq;

  x86::Xmm xmmA = vecA.xmm();
  x86::Xmm xmmB = vecB.xmm();
  x86::Xmm xmmC = vecC.xmm();
  x86::Xmm xmmD = vecD.xmm();

  x86::Ymm ymmA = vecA.ymm();
  x86::Ymm ymmB = vecB.ymm();
  x86::Ymm ymmC = vecC.ymm();
  x86::Ymm ymmD = vecD.ymm();

  // Memory operand without an explicit size - used where the instruction itself implies it.
  x86::Mem m = x86::ptr(gpz);
  // Explicitly sized operands for instructions that have both `xmm, m128` and `xmm, m256` forms
  // (vcvtpd2dq, vcvtpd2ps, vcvttpd2dq), where the destination doesn't tell the forms apart.
  x86::Mem m128 = x86::xmmword_ptr(gpz);
  x86::Mem m256 = x86::ymmword_ptr(gpz);
  // Memory operands having a vector index (XMM and YMM respectively) used by gathers.
  x86::Mem vx_ptr = x86::ptr(gpz, xmmD);
  x86::Mem vy_ptr = x86::ptr(gpz, ymmD);

  cc.xor_(gpd, gpd);
  cc.vxorps(xmmA, xmmA, xmmA);
  cc.vxorps(xmmB, xmmB, xmmB);
  cc.vxorps(xmmC, xmmC, xmmC);
  cc.vxorps(xmmD, xmmD, xmmD);

  // AVX.
  cc.vaddpd(xmmA, xmmB, m);
  cc.vaddpd(ymmA, ymmB, m);
  cc.vaddps(xmmA, xmmB, m);
  cc.vaddps(ymmA, ymmB, m);
  cc.vaddsd(xmmA, xmmB, m);
  cc.vaddss(xmmA, xmmB, m);
  cc.vaddsubpd(xmmA, xmmB, m);
  cc.vaddsubpd(ymmA, ymmB, m);
  cc.vaddsubps(xmmA, xmmB, m);
  cc.vaddsubps(ymmA, ymmB, m);
  cc.vandpd(xmmA, xmmB, m);
  cc.vandpd(ymmA, ymmB, m);
  cc.vandps(xmmA, xmmB, m);
  cc.vandps(ymmA, ymmB, m);
  cc.vandnpd(xmmA, xmmB, m);
  cc.vandnpd(ymmA, ymmB, m);
  cc.vandnps(xmmA, xmmB, m);
  cc.vandnps(ymmA, ymmB, m);
  cc.vblendpd(xmmA, xmmB, m, 0);
  cc.vblendpd(ymmA, ymmB, m, 0);
  cc.vblendps(xmmA, xmmB, m, 0);
  cc.vblendps(ymmA, ymmB, m, 0);
  cc.vblendvpd(xmmA, xmmB, m, xmmA);
  cc.vblendvpd(ymmA, ymmB, m, ymmA);
  cc.vbroadcastf128(ymmA, m);
  cc.vbroadcastsd(ymmA, m);
  cc.vbroadcastss(xmmA, m);
  cc.vbroadcastss(ymmA, m);
  cc.vcmppd(xmmA, xmmB, m, 0);
  cc.vcmppd(ymmA, ymmB, m, 0);
  cc.vcmpps(xmmA, xmmB, m, 0);
  cc.vcmpps(ymmA, ymmB, m, 0);
  cc.vcmpsd(xmmA, xmmB, m, 0);
  cc.vcmpss(xmmA, xmmB, m, 0);
  cc.vcomisd(xmmA, m);
  cc.vcomiss(xmmA, m);
  cc.vcvtdq2pd(xmmA, m);
  cc.vcvtdq2pd(ymmA, m);
  cc.vcvtdq2ps(xmmA, m);
  cc.vcvtdq2ps(ymmA, m);
  cc.vcvtpd2dq(xmmA, m128);
  cc.vcvtpd2dq(xmmA, m256);
  cc.vcvtpd2ps(xmmA, m128);
  cc.vcvtpd2ps(xmmA, m256);
  cc.vcvtps2dq(xmmA, m);
  cc.vcvtps2dq(ymmA, m);
  cc.vcvtps2pd(xmmA, m);
  cc.vcvtps2pd(ymmA, m);
  cc.vcvtsd2si(gpd, m);
  cc.vcvtsd2ss(xmmA, xmmB, m);
  cc.vcvtsi2sd(xmmA, xmmB, m);
  cc.vcvtsi2ss(xmmA, xmmB, m);
  cc.vcvtss2sd(xmmA, xmmB, m);
  cc.vcvtss2si(gpd, m);
  cc.vcvttpd2dq(xmmA, m128);
  cc.vcvttpd2dq(xmmA, m256);
  cc.vcvttps2dq(xmmA, m);
  cc.vcvttps2dq(ymmA, m);
  cc.vcvttsd2si(gpd, m);
  cc.vcvttss2si(gpd, m);
  cc.vdivpd(xmmA, xmmB, m);
  cc.vdivpd(ymmA, ymmB, m);
  cc.vdivps(xmmA, xmmB, m);
  cc.vdivps(ymmA, ymmB, m);
  cc.vdivsd(xmmA, xmmB, m);
  cc.vdivss(xmmA, xmmB, m);
  cc.vdppd(xmmA, xmmB, m, 0);
  cc.vdpps(xmmA, xmmB, m, 0);
  cc.vdpps(ymmA, ymmB, m, 0);
  cc.vextractf128(m, ymmB, 0);
  cc.vextractps(m, xmmB, 0);
  cc.vhaddpd(xmmA, xmmB, m);
  cc.vhaddpd(ymmA, ymmB, m);
  cc.vhaddps(xmmA, xmmB, m);
  cc.vhaddps(ymmA, ymmB, m);
  cc.vhsubpd(xmmA, xmmB, m);
  cc.vhsubpd(ymmA, ymmB, m);
  cc.vhsubps(xmmA, xmmB, m);
  cc.vhsubps(ymmA, ymmB, m);
  cc.vinsertf128(ymmA, ymmB, m, 0);
  cc.vinsertps(xmmA, xmmB, m, 0);
  cc.vlddqu(xmmA, m);
  cc.vlddqu(ymmA, m);
  cc.vmaskmovps(xmmA, xmmB, m);
  cc.vmaskmovps(ymmA, ymmB, m);
  cc.vmaskmovps(m, xmmB, xmmC);
  cc.vmaskmovps(m, ymmB, ymmC);
  cc.vmaskmovpd(xmmA, xmmB, m);
  cc.vmaskmovpd(ymmA, ymmB, m);
  cc.vmaskmovpd(m, xmmB, xmmC);
  cc.vmaskmovpd(m, ymmB, ymmC);
  cc.vmaxpd(xmmA, xmmB, m);
  cc.vmaxpd(ymmA, ymmB, m);
  cc.vmaxps(xmmA, xmmB, m);
  cc.vmaxps(ymmA, ymmB, m);
  cc.vmaxsd(xmmA, xmmB, m);
  cc.vmaxss(xmmA, xmmB, m);
  cc.vminpd(xmmA, xmmB, m);
  cc.vminpd(ymmA, ymmB, m);
  cc.vminps(xmmA, xmmB, m);
  cc.vminps(ymmA, ymmB, m);
  cc.vminsd(xmmA, xmmB, m);
  cc.vminss(xmmA, xmmB, m);
  cc.vmovapd(xmmA, m);
  cc.vmovapd(m, xmmB);
  cc.vmovapd(ymmA, m);
  cc.vmovapd(m, ymmB);
  cc.vmovaps(xmmA, m);
  cc.vmovaps(m, xmmB);
  cc.vmovaps(ymmA, m);
  cc.vmovaps(m, ymmB);
  cc.vmovd(xmmA, m);
  cc.vmovd(m, xmmB);
  cc.vmovddup(xmmA, m);
  cc.vmovddup(ymmA, m);
  cc.vmovdqa(xmmA, m);
  cc.vmovdqa(m, xmmB);
  cc.vmovdqa(ymmA, m);
  cc.vmovdqa(m, ymmB);
  cc.vmovdqu(xmmA, m);
  cc.vmovdqu(m, xmmB);
  cc.vmovdqu(ymmA, m);
  cc.vmovdqu(m, ymmB);
  cc.vmovhpd(xmmA, xmmB, m);
  cc.vmovhps(xmmA, xmmB, m);
  cc.vmovhps(m, xmmB);
  cc.vmovlpd(xmmA, xmmB, m);
  cc.vmovlpd(m, xmmB);
  cc.vmovlps(xmmA, xmmB, m);
  cc.vmovlps(m, xmmB);
  cc.vmovntdq(m, xmmB);
  cc.vmovntdq(m, ymmB);
  cc.vmovntdqa(xmmA, m);
  cc.vmovntpd(m, xmmB);
  cc.vmovntpd(m, ymmB);
  cc.vmovntps(m, xmmB);
  cc.vmovntps(m, ymmB);
  cc.vmovsd(xmmA, m);
  cc.vmovsd(m, xmmB);
  cc.vmovshdup(xmmA, m);
  cc.vmovshdup(ymmA, m);
  cc.vmovsldup(xmmA, m);
  cc.vmovsldup(ymmA, m);
  cc.vmovss(xmmA, m);
  cc.vmovss(m, xmmB);
  cc.vmovupd(xmmA, m);
  cc.vmovupd(m, xmmB);
  cc.vmovupd(ymmA, m);
  cc.vmovupd(m, ymmB);
  cc.vmovups(xmmA, m);
  cc.vmovups(m, xmmB);
  cc.vmovups(ymmA, m);
  cc.vmovups(m, ymmB);
  cc.vmpsadbw(xmmA, xmmB, m, 0);
  cc.vmulpd(xmmA, xmmB, m);
  cc.vmulpd(ymmA, ymmB, m);
  cc.vmulps(xmmA, xmmB, m);
  cc.vmulps(ymmA, ymmB, m);
  cc.vmulsd(xmmA, xmmB, m);
  cc.vmulss(xmmA, xmmB, m);
  cc.vorpd(xmmA, xmmB, m);
  cc.vorpd(ymmA, ymmB, m);
  cc.vorps(xmmA, xmmB, m);
  cc.vorps(ymmA, ymmB, m);
  cc.vpabsb(xmmA, m);
  cc.vpabsd(xmmA, m);
  cc.vpabsw(xmmA, m);
  cc.vpackssdw(xmmA, xmmB, m);
  cc.vpacksswb(xmmA, xmmB, m);
  cc.vpackusdw(xmmA, xmmB, m);
  cc.vpackuswb(xmmA, xmmB, m);
  cc.vpaddb(xmmA, xmmB, m);
  cc.vpaddd(xmmA, xmmB, m);
  cc.vpaddq(xmmA, xmmB, m);
  cc.vpaddw(xmmA, xmmB, m);
  cc.vpaddsb(xmmA, xmmB, m);
  cc.vpaddsw(xmmA, xmmB, m);
  cc.vpaddusb(xmmA, xmmB, m);
  cc.vpaddusw(xmmA, xmmB, m);
  cc.vpalignr(xmmA, xmmB, m, 0);
  cc.vpand(xmmA, xmmB, m);
  cc.vpandn(xmmA, xmmB, m);
  cc.vpavgb(xmmA, xmmB, m);
  cc.vpavgw(xmmA, xmmB, m);
  cc.vpblendvb(xmmA, xmmB, m, xmmA);
  cc.vpblendw(xmmA, xmmB, m, 0);
  cc.vpcmpeqb(xmmA, xmmB, m);
  cc.vpcmpeqd(xmmA, xmmB, m);
  cc.vpcmpeqq(xmmA, xmmB, m);
  cc.vpcmpeqw(xmmA, xmmB, m);
  cc.vpcmpgtb(xmmA, xmmB, m);
  cc.vpcmpgtd(xmmA, xmmB, m);
  cc.vpcmpgtq(xmmA, xmmB, m);
  cc.vpcmpgtw(xmmA, xmmB, m);
  cc.vpermilpd(xmmA, xmmB, m);
  cc.vpermilpd(ymmA, ymmB, m);
  cc.vpermilpd(xmmA, m, 0);
  cc.vpermilpd(ymmA, m, 0);
  cc.vpermilps(xmmA, xmmB, m);
  cc.vpermilps(ymmA, ymmB, m);
  cc.vpermilps(xmmA, m, 0);
  cc.vpermilps(ymmA, m, 0);
  cc.vperm2f128(ymmA, ymmB, m, 0);
  cc.vpextrb(m, xmmB, 0);
  cc.vpextrd(m, xmmB, 0);
  // Only emitted when targeting 64-bit mode.
  if (cc.is64Bit()) cc.vpextrq(m, xmmB, 0);
  cc.vpextrw(m, xmmB, 0);
  cc.vphaddd(xmmA, xmmB, m);
  cc.vphaddsw(xmmA, xmmB, m);
  cc.vphaddw(xmmA, xmmB, m);
  cc.vphminposuw(xmmA, m);
  cc.vphsubd(xmmA, xmmB, m);
  cc.vphsubsw(xmmA, xmmB, m);
  cc.vphsubw(xmmA, xmmB, m);
  cc.vpinsrb(xmmA, xmmB, m, 0);
  cc.vpinsrd(xmmA, xmmB, m, 0);
  cc.vpinsrw(xmmA, xmmB, m, 0);
  cc.vpmaddubsw(xmmA, xmmB, m);
  cc.vpmaddwd(xmmA, xmmB, m);
  cc.vpmaxsb(xmmA, xmmB, m);
  cc.vpmaxsd(xmmA, xmmB, m);
  cc.vpmaxsw(xmmA, xmmB, m);
  cc.vpmaxub(xmmA, xmmB, m);
  cc.vpmaxud(xmmA, xmmB, m);
  cc.vpmaxuw(xmmA, xmmB, m);
  cc.vpminsb(xmmA, xmmB, m);
  cc.vpminsd(xmmA, xmmB, m);
  cc.vpminsw(xmmA, xmmB, m);
  cc.vpminub(xmmA, xmmB, m);
  cc.vpminud(xmmA, xmmB, m);
  cc.vpminuw(xmmA, xmmB, m);
  cc.vpmovsxbd(xmmA, m);
  cc.vpmovsxbq(xmmA, m);
  cc.vpmovsxbw(xmmA, m);
  cc.vpmovsxdq(xmmA, m);
  cc.vpmovsxwd(xmmA, m);
  cc.vpmovsxwq(xmmA, m);
  cc.vpmovzxbd(xmmA, m);
  cc.vpmovzxbq(xmmA, m);
  cc.vpmovzxbw(xmmA, m);
  cc.vpmovzxdq(xmmA, m);
  cc.vpmovzxwd(xmmA, m);
  cc.vpmovzxwq(xmmA, m);
  cc.vpmuldq(xmmA, xmmB, m);
  cc.vpmulhrsw(xmmA, xmmB, m);
  cc.vpmulhuw(xmmA, xmmB, m);
  cc.vpmulhw(xmmA, xmmB, m);
  cc.vpmulld(xmmA, xmmB, m);
  cc.vpmullw(xmmA, xmmB, m);
  cc.vpmuludq(xmmA, xmmB, m);
  cc.vpor(xmmA, xmmB, m);
  cc.vpsadbw(xmmA, xmmB, m);
  cc.vpshufb(xmmA, xmmB, m);
  cc.vpshufd(xmmA, m, 0);
  cc.vpshufhw(xmmA, m, 0);
  cc.vpshuflw(xmmA, m, 0);
  cc.vpsignb(xmmA, xmmB, m);
  cc.vpsignd(xmmA, xmmB, m);
  cc.vpsignw(xmmA, xmmB, m);
  cc.vpslld(xmmA, xmmB, m);
  cc.vpsllq(xmmA, xmmB, m);
  cc.vpsllw(xmmA, xmmB, m);
  cc.vpsrad(xmmA, xmmB, m);
  cc.vpsraw(xmmA, xmmB, m);
  cc.vpsrld(xmmA, xmmB, m);
  cc.vpsrlq(xmmA, xmmB, m);
  cc.vpsrlw(xmmA, xmmB, m);
  cc.vpsubb(xmmA, xmmB, m);
  cc.vpsubd(xmmA, xmmB, m);
  cc.vpsubq(xmmA, xmmB, m);
  cc.vpsubw(xmmA, xmmB, m);
  cc.vpsubsb(xmmA, xmmB, m);
  cc.vpsubsw(xmmA, xmmB, m);
  cc.vpsubusb(xmmA, xmmB, m);
  cc.vpsubusw(xmmA, xmmB, m);
  cc.vptest(xmmA, m);
  cc.vptest(ymmA, m);
  cc.vpunpckhbw(xmmA, xmmB, m);
  cc.vpunpckhdq(xmmA, xmmB, m);
  cc.vpunpckhqdq(xmmA, xmmB, m);
  cc.vpunpckhwd(xmmA, xmmB, m);
  cc.vpunpcklbw(xmmA, xmmB, m);
  cc.vpunpckldq(xmmA, xmmB, m);
  cc.vpunpcklqdq(xmmA, xmmB, m);
  cc.vpunpcklwd(xmmA, xmmB, m);
  cc.vpxor(xmmA, xmmB, m);
  cc.vrcpps(xmmA, m);
  cc.vrcpps(ymmA, m);
  cc.vrcpss(xmmA, xmmB, m);
  cc.vrsqrtps(xmmA, m);
  cc.vrsqrtps(ymmA, m);
  cc.vrsqrtss(xmmA, xmmB, m);
  cc.vroundpd(xmmA, m, 0);
  cc.vroundpd(ymmA, m, 0);
  cc.vroundps(xmmA, m, 0);
  cc.vroundps(ymmA, m, 0);
  cc.vroundsd(xmmA, xmmB, m, 0);
  cc.vroundss(xmmA, xmmB, m, 0);
  cc.vshufpd(xmmA, xmmB, m, 0);
  cc.vshufpd(ymmA, ymmB, m, 0);
  cc.vshufps(xmmA, xmmB, m, 0);
  cc.vshufps(ymmA, ymmB, m, 0);
  cc.vsqrtpd(xmmA, m);
  cc.vsqrtpd(ymmA, m);
  cc.vsqrtps(xmmA, m);
  cc.vsqrtps(ymmA, m);
  cc.vsqrtsd(xmmA, xmmB, m);
  cc.vsqrtss(xmmA, xmmB, m);
  cc.vsubpd(xmmA, xmmB, m);
  cc.vsubpd(ymmA, ymmB, m);
  cc.vsubps(xmmA, xmmB, m);
  cc.vsubps(ymmA, ymmB, m);
  cc.vsubsd(xmmA, xmmB, m);
  cc.vsubss(xmmA, xmmB, m);
  cc.vtestps(xmmA, m);
  cc.vtestps(ymmA, m);
  cc.vtestpd(xmmA, m);
  cc.vtestpd(ymmA, m);
  cc.vucomisd(xmmA, m);
  cc.vucomiss(xmmA, m);
  cc.vunpckhpd(xmmA, xmmB, m);
  cc.vunpckhpd(ymmA, ymmB, m);
  cc.vunpckhps(xmmA, xmmB, m);
  cc.vunpckhps(ymmA, ymmB, m);
  cc.vunpcklpd(xmmA, xmmB, m);
  cc.vunpcklpd(ymmA, ymmB, m);
  cc.vunpcklps(xmmA, xmmB, m);
  cc.vunpcklps(ymmA, ymmB, m);
  cc.vxorpd(xmmA, xmmB, m);
  cc.vxorpd(ymmA, ymmB, m);
  cc.vxorps(xmmA, xmmB, m);
  cc.vxorps(ymmA, ymmB, m);

  // AVX+AESNI.
  cc.vaesdec(xmmA, xmmB, m);
  cc.vaesdeclast(xmmA, xmmB, m);
  cc.vaesenc(xmmA, xmmB, m);
  cc.vaesenclast(xmmA, xmmB, m);
  cc.vaesimc(xmmA, m);
  cc.vaeskeygenassist(xmmA, m, 0);

  // AVX+PCLMULQDQ.
  cc.vpclmulqdq(xmmA, xmmB, m, 0);

  // AVX2.
  cc.vbroadcasti128(ymmA, m);
  cc.vextracti128(m, ymmB, 0);
  cc.vgatherdpd(xmmA, vx_ptr, xmmC);
  cc.vgatherdpd(ymmA, vx_ptr, ymmC);
  cc.vgatherdps(xmmA, vx_ptr, xmmC);
  cc.vgatherdps(ymmA, vy_ptr, ymmC);
  cc.vgatherqpd(xmmA, vx_ptr, xmmC);
  cc.vgatherqpd(ymmA, vy_ptr, ymmC);
  cc.vgatherqps(xmmA, vx_ptr, xmmC);
  cc.vgatherqps(xmmA, vy_ptr, xmmC);
  cc.vinserti128(ymmA, ymmB, m, 0);
  cc.vmovntdqa(ymmA, m);
  cc.vmpsadbw(ymmA, ymmB, m, 0);
  cc.vpabsb(ymmA, m);
  cc.vpabsd(ymmA, m);
  cc.vpabsw(ymmA, m);
  cc.vpackssdw(ymmA, ymmB, m);
  cc.vpacksswb(ymmA, ymmB, m);
  cc.vpackusdw(ymmA, ymmB, m);
  cc.vpackuswb(ymmA, ymmB, m);
  cc.vpaddb(ymmA, ymmB, m);
  cc.vpaddd(ymmA, ymmB, m);
  cc.vpaddq(ymmA, ymmB, m);
  cc.vpaddw(ymmA, ymmB, m);
  cc.vpaddsb(ymmA, ymmB, m);
  cc.vpaddsw(ymmA, ymmB, m);
  cc.vpaddusb(ymmA, ymmB, m);
  cc.vpaddusw(ymmA, ymmB, m);
  cc.vpalignr(ymmA, ymmB, m, 0);
  cc.vpand(ymmA, ymmB, m);
  cc.vpandn(ymmA, ymmB, m);
  cc.vpavgb(ymmA, ymmB, m);
  cc.vpavgw(ymmA, ymmB, m);
  cc.vpblendd(xmmA, xmmB, m, 0);
  cc.vpblendd(ymmA, ymmB, m, 0);
  cc.vpblendvb(ymmA, ymmB, m, ymmA);
  cc.vpblendw(ymmA, ymmB, m, 0);
  cc.vpbroadcastb(xmmA, m);
  cc.vpbroadcastb(ymmA, m);
  cc.vpbroadcastd(xmmA, m);
  cc.vpbroadcastd(ymmA, m);
  cc.vpbroadcastq(xmmA, m);
  cc.vpbroadcastq(ymmA, m);
  cc.vpbroadcastw(xmmA, m);
  cc.vpbroadcastw(ymmA, m);
  cc.vpcmpeqb(ymmA, ymmB, m);
  cc.vpcmpeqd(ymmA, ymmB, m);
  cc.vpcmpeqq(ymmA, ymmB, m);
  cc.vpcmpeqw(ymmA, ymmB, m);
  cc.vpcmpgtb(ymmA, ymmB, m);
  cc.vpcmpgtd(ymmA, ymmB, m);
  cc.vpcmpgtq(ymmA, ymmB, m);
  cc.vpcmpgtw(ymmA, ymmB, m);
  cc.vperm2i128(ymmA, ymmB, m, 0);
  cc.vpermd(ymmA, ymmB, m);
  cc.vpermps(ymmA, ymmB, m);
  cc.vpermpd(ymmA, m, 0);
  cc.vpermq(ymmA, m, 0);
  cc.vpgatherdd(xmmA, vx_ptr, xmmC);
  cc.vpgatherdd(ymmA, vy_ptr, ymmC);
  cc.vpgatherdq(xmmA, vx_ptr, xmmC);
  cc.vpgatherdq(ymmA, vx_ptr, ymmC);
  cc.vpgatherqd(xmmA, vx_ptr, xmmC);
  cc.vpgatherqd(xmmA, vy_ptr, xmmC);
  cc.vpgatherqq(xmmA, vx_ptr, xmmC);
  cc.vpgatherqq(ymmA, vy_ptr, ymmC);
  cc.vpmovsxbd(ymmA, m);
  cc.vpmovsxbq(ymmA, m);
  cc.vpmovsxbw(ymmA, m);
  cc.vpmovsxdq(ymmA, m);
  cc.vpmovsxwd(ymmA, m);
  cc.vpmovsxwq(ymmA, m);
  cc.vpmovzxbd(ymmA, m);
  cc.vpmovzxbq(ymmA, m);
  cc.vpmovzxbw(ymmA, m);
  cc.vpmovzxdq(ymmA, m);
  cc.vpmovzxwd(ymmA, m);
  cc.vpmovzxwq(ymmA, m);
  cc.vpshufd(ymmA, m, 0);
  cc.vpshufhw(ymmA, m, 0);
  cc.vpshuflw(ymmA, m, 0);
  cc.vphaddd(ymmA, ymmB, m);
  cc.vphaddsw(ymmA, ymmB, m);
  cc.vphaddw(ymmA, ymmB, m);
  cc.vphsubd(ymmA, ymmB, m);
  cc.vphsubsw(ymmA, ymmB, m);
  cc.vphsubw(ymmA, ymmB, m);
  cc.vpmaddubsw(ymmA, ymmB, m);
  cc.vpmaddwd(ymmA, ymmB, m);
  cc.vpmaskmovd(m, xmmB, xmmC);
  cc.vpmaskmovd(m, ymmB, ymmC);
  cc.vpmaskmovd(xmmA, xmmB, m);
  cc.vpmaskmovd(ymmA, ymmB, m);
  cc.vpmaskmovq(m, xmmB, xmmC);
  cc.vpmaskmovq(m, ymmB, ymmC);
  cc.vpmaskmovq(xmmA, xmmB, m);
  cc.vpmaskmovq(ymmA, ymmB, m);
  cc.vpmaxsb(ymmA, ymmB, m);
  cc.vpmaxsd(ymmA, ymmB, m);
  cc.vpmaxsw(ymmA, ymmB, m);
  cc.vpmaxub(ymmA, ymmB, m);
  cc.vpmaxud(ymmA, ymmB, m);
  cc.vpmaxuw(ymmA, ymmB, m);
  cc.vpminsb(ymmA, ymmB, m);
  cc.vpminsd(ymmA, ymmB, m);
  cc.vpminsw(ymmA, ymmB, m);
  cc.vpminub(ymmA, ymmB, m);
  cc.vpminud(ymmA, ymmB, m);
  cc.vpminuw(ymmA, ymmB, m);
  cc.vpmuldq(ymmA, ymmB, m);
  cc.vpmulhrsw(ymmA, ymmB, m);
  cc.vpmulhuw(ymmA, ymmB, m);
  cc.vpmulhw(ymmA, ymmB, m);
  cc.vpmulld(ymmA, ymmB, m);
  cc.vpmullw(ymmA, ymmB, m);
  cc.vpmuludq(ymmA, ymmB, m);
  cc.vpor(ymmA, ymmB, m);
  cc.vpsadbw(ymmA, ymmB, m);
  cc.vpshufb(ymmA, ymmB, m);
  cc.vpsignb(ymmA, ymmB, m);
  cc.vpsignd(ymmA, ymmB, m);
  cc.vpsignw(ymmA, ymmB, m);
  cc.vpslld(ymmA, ymmB, m);
  cc.vpsllq(ymmA, ymmB, m);
  cc.vpsllvd(xmmA, xmmB, m);
  cc.vpsllvd(ymmA, ymmB, m);
  cc.vpsllvq(xmmA, xmmB, m);
  cc.vpsllvq(ymmA, ymmB, m);
  cc.vpsllw(ymmA, ymmB, m);
  cc.vpsrad(ymmA, ymmB, m);
  cc.vpsravd(xmmA, xmmB, m);
  cc.vpsravd(ymmA, ymmB, m);
  cc.vpsraw(ymmA, ymmB, m);
  cc.vpsrld(ymmA, ymmB, m);
  cc.vpsrlq(ymmA, ymmB, m);
  cc.vpsrlvd(xmmA, xmmB, m);
  cc.vpsrlvd(ymmA, ymmB, m);
  cc.vpsrlvq(xmmA, xmmB, m);
  cc.vpsrlvq(ymmA, ymmB, m);
  cc.vpsrlw(ymmA, ymmB, m);
  cc.vpsubb(ymmA, ymmB, m);
  cc.vpsubd(ymmA, ymmB, m);
  cc.vpsubq(ymmA, ymmB, m);
  cc.vpsubsb(ymmA, ymmB, m);
  cc.vpsubsw(ymmA, ymmB, m);
  cc.vpsubusb(ymmA, ymmB, m);
  cc.vpsubusw(ymmA, ymmB, m);
  cc.vpsubw(ymmA, ymmB, m);
  cc.vpunpckhbw(ymmA, ymmB, m);
  cc.vpunpckhdq(ymmA, ymmB, m);
  cc.vpunpckhqdq(ymmA, ymmB, m);
  cc.vpunpckhwd(ymmA, ymmB, m);
  cc.vpunpcklbw(ymmA, ymmB, m);
  cc.vpunpckldq(ymmA, ymmB, m);
  cc.vpunpcklqdq(ymmA, ymmB, m);
  cc.vpunpcklwd(ymmA, ymmB, m);
  cc.vpxor(ymmA, ymmB, m);
}

// Generates a long sequence of AVX instructions - `form` selects whether the register-only
// variant or the register/memory variant of the sequence is emitted.
template<typename Emitter>
static void generateAvxSequenceInternal(
  Emitter& cc,
  InstForm form,
  const x86::Gp& gp,
  const x86::Vec& vecA, const x86::Vec& vecB, const x86::Vec& vecC, const x86::Vec& vecD) {

  // Anything other than `kReg` selects the register/memory variant.
  if (form != InstForm::kReg) {
    generateAvxSequenceInternalRegMem(cc, gp, vecA, vecB, vecC, vecD);
    return;
  }

  generateAvxSequenceInternalRegOnly(cc, gp, vecA, vecB, vecC, vecD);
}

// Emits the AVX instruction sequence of the given `form` through `emitter`.
//
// Assembler and Builder emitters use fixed physical registers (eax and ymm0..ymm3) and wrap
// the sequence in a function prolog/epilog when `emitPrologEpilog` is true. A Compiler emitter
// uses virtual registers and always wraps the sequence in a function, `emitPrologEpilog` is
// not consulted in that case. Nothing is emitted for any other emitter type.
static void generateAvxSequence(BaseEmitter& emitter, InstForm form, bool emitPrologEpilog) {
  using namespace asmjit::x86;

  if (emitter.isAssembler()) {
    Assembler& cc = *emitter.as<Assembler>();

    // No frame requested - emit the bare sequence.
    if (!emitPrologEpilog) {
      generateAvxSequenceInternal(cc, form, eax, ymm0, ymm1, ymm2, ymm3);
      return;
    }

    FuncDetail detail;
    detail.init(FuncSignature::build<void, void*, const void*, size_t>(), cc.environment());

    // Every register the sequence writes has to be marked dirty in the frame.
    FuncFrame frame;
    frame.init(detail);
    frame.addDirtyRegs(eax, ymm0, ymm1, ymm2, ymm3);
    frame.finalize();

    cc.emitProlog(frame);
    generateAvxSequenceInternal(cc, form, eax, ymm0, ymm1, ymm2, ymm3);
    cc.emitEpilog(frame);
    return;
  }

#ifndef ASMJIT_NO_BUILDER
  if (emitter.isBuilder()) {
    Builder& cc = *emitter.as<Builder>();

    // No frame requested - emit the bare sequence.
    if (!emitPrologEpilog) {
      generateAvxSequenceInternal(cc, form, eax, ymm0, ymm1, ymm2, ymm3);
      return;
    }

    FuncDetail detail;
    detail.init(FuncSignature::build<void, void*, const void*, size_t>(), cc.environment());

    // Every register the sequence writes has to be marked dirty in the frame.
    FuncFrame frame;
    frame.init(detail);
    frame.addDirtyRegs(eax, ymm0, ymm1, ymm2, ymm3);
    frame.finalize();

    cc.emitProlog(frame);
    generateAvxSequenceInternal(cc, form, eax, ymm0, ymm1, ymm2, ymm3);
    cc.emitEpilog(frame);
    return;
  }
#endif

#ifndef ASMJIT_NO_COMPILER
  if (emitter.isCompiler()) {
    Compiler& cc = *emitter.as<Compiler>();

    // Virtual registers are created before the function node, in this exact order.
    Gp baseReg = cc.newGpz("gp");
    Ymm v0 = cc.newYmm("a");
    Ymm v1 = cc.newYmm("b");
    Ymm v2 = cc.newYmm("c");
    Ymm v3 = cc.newYmm("d");

    cc.addFunc(FuncSignature::build<void>());
    generateAvxSequenceInternal(cc, form, baseReg, v0, v1, v2, v3);
    cc.endFunc();
  }
#endif
}

// Generates a long sequence of AVX512 instructions.
template<typename Emitter>
static void generateAvx512SequenceInternalRegOnly(
  Emitter& cc,
  const x86::Gp& gp,
  const x86::KReg& kA, const x86::KReg& kB, const x86::KReg& kC,
  const x86::Vec& vecA, const x86::Vec& vecB, const x86::Vec& vecC, const x86::Vec& vecD) {

  x86::Gp gpd = gp.r32();
  x86::Gp gpq = gp.r64();
  x86::Gp gpz = cc.is32Bit() ? gpd : gpq;

  x86::Xmm xmmA = vecA.xmm();
  x86::Xmm xmmB = vecB.xmm();
  x86::Xmm xmmC = vecC.xmm();
  x86::Xmm xmmD = vecD.xmm();

  x86::Ymm ymmA = vecA.ymm();
  x86::Ymm ymmB = vecB.ymm();
  x86::Ymm ymmC = vecC.ymm();

  x86::Zmm zmmA = vecA.zmm();
  x86::Zmm zmmB = vecB.zmm();
  x86::Zmm zmmC = vecC.zmm();

  cc.xor_(gpd, gpd);
  cc.vxorps(xmmA, xmmA, xmmA);
  cc.vxorps(xmmB, xmmB, xmmB);
  cc.vxorps(xmmC, xmmC, xmmC);
  cc.vxorps(xmmD, xmmD, xmmD);

  cc.kaddb(kA, kB, kC);
  cc.kaddd(kA, kB, kC);
  cc.kaddq(kA, kB, kC);
  cc.kaddw(kA, kB, kC);
  cc.kandb(kA, kB, kC);
  cc.kandd(kA, kB, kC);
  cc.kandnb(kA, kB, kC);
  cc.kandnd(kA, kB, kC);
  cc.kandnq(kA, kB, kC);
  cc.kandnw(kA, kB, kC);
  cc.kandq(kA, kB, kC);
  cc.kandw(kA, kB, kC);
  cc.kmovb(kA, kB);
  cc.kmovb(kA, gpd);
  cc.kmovb(gpd, kB);
  cc.kmovd(kA, kB);
  cc.kmovd(kA, gpd);
  cc.kmovd(gpd, kB);
  cc.kmovq(kA, kB);
  if (cc.is64Bit()) cc.kmovq(kA, gpq);
  if (cc.is64Bit()) cc.kmovq(gpq, kB);
  cc.kmovw(kA, kB);
  cc.kmovw(kA, gpd);
  cc.kmovw(gpd, kB);
  cc.knotb(kA, kB);
  cc.knotd(kA, kB);
  cc.knotq(kA, kB);
  cc.knotw(kA, kB);
  cc.korb(kA, kB, kC);
  cc.kord(kA, kB, kC);
  cc.korq(kA, kB, kC);
  cc.kortestb(kA, kB);
  cc.kortestd(kA, kB);
  cc.kortestq(kA, kB);
  cc.kortestw(kA, kB);
  cc.korw(kA, kB, kC);
  cc.kshiftlb(kA, kB, 0);
  cc.kshiftld(kA, kB, 0);
  cc.kshiftlq(kA, kB, 0);
  cc.kshiftlw(kA, kB, 0);
  cc.kshiftrb(kA, kB, 0);
  cc.kshiftrd(kA, kB, 0);
  cc.kshiftrq(kA, kB, 0);
  cc.kshiftrw(kA, kB, 0);
  cc.ktestb(kA, kB);
  cc.ktestd(kA, kB);
  cc.ktestq(kA, kB);
  cc.ktestw(kA, kB);
  cc.kunpckbw(kA, kB, kC);
  cc.kunpckdq(kA, kB, kC);
  cc.kunpckwd(kA, kB, kC);
  cc.kxnorb(kA, kB, kC);
  cc.kxnord(kA, kB, kC);
  cc.kxnorq(kA, kB, kC);
  cc.kxnorw(kA, kB, kC);
  cc.kxorb(kA, kB, kC);
  cc.kxord(kA, kB, kC);
  cc.kxorq(kA, kB, kC);
  cc.kxorw(kA, kB, kC);
  cc.nop();

  cc.evex().vaddpd(xmmA, xmmB, xmmC);
  cc.evex().vaddpd(ymmA, ymmB, ymmC);
  cc.evex().vaddpd(zmmA, zmmB, zmmC);
  cc.evex().vaddps(xmmA, xmmB, xmmC);
  cc.evex().vaddps(ymmA, ymmB, ymmC);
  cc.evex().vaddps(zmmA, zmmB, zmmC);
  cc.evex().vaddsd(xmmA, xmmB, xmmC);
  cc.evex().vaddss(xmmA, xmmB, xmmC);
  cc.evex().valignd(xmmA, xmmB, xmmC, 0);
  cc.evex().valignd(ymmA, ymmB, ymmC, 0);
  cc.evex().valignd(zmmA, zmmB, zmmC, 0);
  cc.evex().valignq(xmmA, xmmB, xmmC, 0);
  cc.evex().valignq(ymmA, ymmB, ymmC, 0);
  cc.evex().valignq(zmmA, zmmB, zmmC, 0);
  cc.evex().vandnpd(xmmA, xmmB, xmmC);
  cc.evex().vandnpd(ymmA, ymmB, ymmC);
  cc.evex().vandnpd(zmmA, zmmB, zmmC);
  cc.evex().vandnps(xmmA, xmmB, xmmC);
  cc.evex().vandnps(ymmA, ymmB, ymmC);
  cc.evex().vandnps(zmmA, zmmB, zmmC);
  cc.evex().vandpd(xmmA, xmmB, xmmC);
  cc.evex().vandpd(ymmA, ymmB, ymmC);
  cc.evex().vandpd(zmmA, zmmB, zmmC);
  cc.evex().vandps(xmmA, xmmB, xmmC);
  cc.evex().vandps(ymmA, ymmB, ymmC);
  cc.evex().vandps(zmmA, zmmB, zmmC);
  cc.evex().vblendmpd(xmmA, xmmB, xmmC);
  cc.evex().vblendmpd(ymmA, ymmB, ymmC);
  cc.evex().vblendmpd(zmmA, zmmB, zmmC);
  cc.evex().vblendmps(xmmA, xmmB, xmmC);
  cc.evex().vblendmps(ymmA, ymmB, ymmC);
  cc.evex().vblendmps(zmmA, zmmB, zmmC);
  cc.evex().vbroadcastf32x2(ymmA, xmmB);
  cc.evex().vbroadcastf32x2(zmmA, xmmB);
  cc.evex().vbroadcasti32x2(xmmA, xmmB);
  cc.evex().vbroadcasti32x2(ymmA, xmmB);
  cc.evex().vbroadcasti32x2(zmmA, xmmB);
  cc.evex().vbroadcastsd(ymmA, xmmB);
  cc.evex().vbroadcastsd(zmmA, xmmB);
  cc.evex().vbroadcastss(xmmA, xmmB);
  cc.evex().vbroadcastss(ymmA, xmmB);
  cc.evex().vbroadcastss(zmmA, xmmB);
  cc.evex().vcmppd(kA, xmmB, xmmC, 0);
  cc.evex().vcmppd(kA, ymmB, ymmC, 0);
  cc.evex().vcmppd(kA, zmmB, zmmC, 0);
  cc.evex().vcmpps(kA, xmmB, xmmC, 0);
  cc.evex().vcmpps(kA, ymmB, ymmC, 0);
  cc.evex().vcmpps(kA, zmmB, zmmC, 0);
  cc.evex().vcmpsd(kA, xmmB, xmmC, 0);
  cc.evex().vcmpss(kA, xmmB, xmmC, 0);
  cc.evex().vcomisd(xmmA, xmmB);
  cc.evex().vcomiss(xmmA, xmmB);
  cc.evex().vcompresspd(xmmA, xmmB);
  cc.evex().vcompresspd(ymmA, ymmB);
  cc.evex().vcompresspd(zmmA, zmmB);
  cc.evex().vcompressps(xmmA, xmmB);
  cc.evex().vcompressps(ymmA, ymmB);
  cc.evex().vcompressps(zmmA, zmmB);
  cc.evex().vcvtdq2pd(xmmA, xmmB);
  cc.evex().vcvtdq2pd(ymmA, xmmB);
  cc.evex().vcvtdq2pd(zmmA, ymmB);
  cc.evex().vcvtdq2ps(xmmA, xmmB);
  cc.evex().vcvtdq2ps(ymmA, ymmB);
  cc.evex().vcvtdq2ps(zmmA, zmmB);
  cc.evex().vcvtpd2dq(xmmA, xmmB);
  cc.evex().vcvtpd2dq(xmmA, ymmB);
  cc.evex().vcvtpd2dq(ymmA, zmmB);
  cc.evex().vcvtpd2qq(xmmA, xmmB);
  cc.evex().vcvtpd2qq(ymmA, ymmB);
  cc.evex().vcvtpd2qq(zmmA, zmmB);
  cc.evex().vcvtpd2udq(xmmA, xmmB);
  cc.evex().vcvtpd2udq(xmmA, ymmB);
  cc.evex().vcvtpd2udq(ymmA, zmmB);
  cc.evex().vcvtpd2uqq(xmmA, xmmB);
  cc.evex().vcvtpd2uqq(ymmA, ymmB);
  cc.evex().vcvtpd2uqq(zmmA, zmmB);
  cc.evex().vcvtph2ps(xmmA, xmmB);
  cc.evex().vcvtph2ps(ymmA, xmmB);
  cc.evex().vcvtph2ps(zmmA, ymmB);
  cc.evex().vcvtps2dq(xmmA, xmmB);
  cc.evex().vcvtps2dq(ymmA, ymmB);
  cc.evex().vcvtps2dq(zmmA, zmmB);
  cc.evex().vcvtps2pd(xmmA, xmmB);
  cc.evex().vcvtps2pd(ymmA, xmmB);
  cc.evex().vcvtps2pd(zmmA, ymmB);
  cc.evex().vcvtps2ph(xmmA, xmmB, 0);
  cc.evex().vcvtps2ph(xmmA, ymmB, 0);
  cc.evex().vcvtps2ph(ymmA, zmmB, 0);
  cc.evex().vcvtps2qq(xmmA, xmmB);
  cc.evex().vcvtps2qq(ymmA, xmmB);
  cc.evex().vcvtps2qq(zmmA, ymmB);
  cc.evex().vcvtps2udq(xmmA, xmmB);
  cc.evex().vcvtps2udq(ymmA, ymmB);
  cc.evex().vcvtps2udq(zmmA, zmmB);
  cc.evex().vcvtps2uqq(xmmA, xmmB);
  cc.evex().vcvtps2uqq(ymmA, xmmB);
  cc.evex().vcvtps2uqq(zmmA, ymmB);
  cc.evex().vcvtqq2pd(xmmA, xmmB);
  cc.evex().vcvtqq2pd(ymmA, ymmB);
  cc.evex().vcvtqq2pd(zmmA, zmmB);
  cc.evex().vcvtqq2ps(xmmA, xmmB);
  cc.evex().vcvtqq2ps(xmmA, ymmB);
  cc.evex().vcvtqq2ps(ymmA, zmmB);
  cc.evex().vcvtsd2si(gpd, xmmB);
  cc.evex().vcvtsd2si(gpz, xmmB);
  cc.evex().vcvtsd2ss(xmmA, xmmB, xmmC);
  cc.evex().vcvtsd2usi(gpd, xmmB);
  cc.evex().vcvtsd2usi(gpz, xmmB);
  cc.evex().vcvtsi2sd(xmmA, xmmB, gpd);
  cc.evex().vcvtsi2sd(xmmA, xmmB, gpz);
  cc.evex().vcvtsi2ss(xmmA, xmmB, gpd);
  cc.evex().vcvtsi2ss(xmmA, xmmB, gpz);
  cc.evex().vcvtss2sd(xmmA, xmmB, xmmC);
  cc.evex().vcvtss2si(gpd, xmmB);
  cc.evex().vcvtss2si(gpz, xmmB);
  cc.evex().vcvtss2usi(gpd, xmmB);
  cc.evex().vcvtss2usi(gpz, xmmB);
  cc.evex().vcvttpd2dq(xmmA, xmmB);
  cc.evex().vcvttpd2dq(xmmA, ymmB);
  cc.evex().vcvttpd2dq(ymmA, zmmB);
  cc.evex().vcvttpd2qq(xmmA, xmmB);
  cc.evex().vcvttpd2qq(ymmA, ymmB);
  cc.evex().vcvttpd2qq(zmmA, zmmB);
  cc.evex().vcvttpd2udq(xmmA, xmmB);
  cc.evex().vcvttpd2udq(xmmA, ymmB);
  cc.evex().vcvttpd2udq(ymmA, zmmB);
  cc.evex().vcvttpd2uqq(xmmA, xmmB);
  cc.evex().vcvttpd2uqq(ymmA, ymmB);
  cc.evex().vcvttpd2uqq(zmmA, zmmB);
  cc.evex().vcvttps2dq(xmmA, xmmB);
  cc.evex().vcvttps2dq(ymmA, ymmB);
  cc.evex().vcvttps2dq(zmmA, zmmB);
  cc.evex().vcvttps2qq(xmmA, xmmB);
  cc.evex().vcvttps2qq(ymmA, xmmB);
  cc.evex().vcvttps2qq(zmmA, ymmB);
  cc.evex().vcvttps2udq(xmmA, xmmB);
  cc.evex().vcvttps2udq(ymmA, ymmB);
  cc.evex().vcvttps2udq(zmmA, zmmB);
  cc.evex().vcvttps2uqq(xmmA, xmmB);
  cc.evex().vcvttps2uqq(ymmA, xmmB);
  cc.evex().vcvttps2uqq(zmmA, ymmB);
  cc.evex().vcvttsd2si(gpd, xmmB);
  cc.evex().vcvttsd2si(gpz, xmmB);
  cc.evex().vcvttsd2usi(gpd, xmmB);
  cc.evex().vcvttsd2usi(gpz, xmmB);
  cc.evex().vcvttss2si(gpd, xmmB);
  cc.evex().vcvttss2si(gpz, xmmB);
  cc.evex().vcvttss2usi(gpd, xmmB);
  cc.evex().vcvttss2usi(gpz, xmmB);
  cc.evex().vcvtudq2pd(xmmA, xmmB);
  cc.evex().vcvtudq2pd(ymmA, xmmB);
  cc.evex().vcvtudq2pd(zmmA, ymmB);
  cc.evex().vcvtudq2ps(xmmA, xmmB);
  cc.evex().vcvtudq2ps(ymmA, ymmB);
  cc.evex().vcvtudq2ps(zmmA, zmmB);
  cc.evex().vcvtuqq2pd(xmmA, xmmB);
  cc.evex().vcvtuqq2pd(ymmA, ymmB);
  cc.evex().vcvtuqq2pd(zmmA, zmmB);
  cc.evex().vcvtuqq2ps(xmmA, xmmB);
  cc.evex().vcvtuqq2ps(xmmA, ymmB);
  cc.evex().vcvtuqq2ps(ymmA, zmmB);
  cc.evex().vcvtusi2sd(xmmA, xmmB, gpd);
  cc.evex().vcvtusi2sd(xmmA, xmmB, gpz);
  cc.evex().vcvtusi2ss(xmmA, xmmB, gpd);
  cc.evex().vcvtusi2ss(xmmA, xmmB, gpz);
  cc.evex().vdbpsadbw(xmmA, xmmB, xmmC, 0);
  cc.evex().vdbpsadbw(ymmA, ymmB, ymmC, 0);
  cc.evex().vdbpsadbw(zmmA, zmmB, zmmC, 0);
  cc.evex().vdivpd(xmmA, xmmB, xmmC);
  cc.evex().vdivpd(ymmA, ymmB, ymmC);
  cc.evex().vdivpd(zmmA, zmmB, zmmC);
  cc.evex().vdivps(xmmA, xmmB, xmmC);
  cc.evex().vdivps(ymmA, ymmB, ymmC);
  cc.evex().vdivps(zmmA, zmmB, zmmC);
  cc.evex().vdivsd(xmmA, xmmB, xmmC);
  cc.evex().vdivss(xmmA, xmmB, xmmC);
  cc.evex().vexp2pd(zmmA, zmmB);
  cc.evex().vexp2ps(zmmA, zmmB);
  cc.evex().vexpandpd(xmmA, xmmB);
  cc.evex().vexpandpd(ymmA, ymmB);
  cc.evex().vexpandpd(zmmA, zmmB);
  cc.evex().vexpandps(xmmA, xmmB);
  cc.evex().vexpandps(ymmA, ymmB);
  cc.evex().vexpandps(zmmA, zmmB);
  cc.evex().vextractf32x4(xmmA, ymmB, 0);
  cc.evex().vextractf32x4(xmmA, zmmB, 0);
  cc.evex().vextractf32x8(ymmA, zmmB, 0);
  cc.evex().vextractf64x2(xmmA, ymmB, 0);
  cc.evex().vextractf64x2(xmmA, zmmB, 0);
  cc.evex().vextractf64x4(ymmA, zmmB, 0);
  cc.evex().vextracti32x4(xmmA, ymmB, 0);
  cc.evex().vextracti32x4(xmmA, zmmB, 0);
  cc.evex().vextracti32x8(ymmA, zmmB, 0);
  cc.evex().vextracti64x2(xmmA, ymmB, 0);
  cc.evex().vextracti64x2(xmmA, zmmB, 0);
  cc.evex().vextracti64x4(ymmA, zmmB, 0);
  cc.evex().vextractps(gpd, xmmB, 0);
  cc.evex().vfixupimmpd(xmmA, xmmB, xmmC, 0);
  cc.evex().vfixupimmpd(ymmA, ymmB, ymmC, 0);
  cc.evex().vfixupimmpd(zmmA, zmmB, zmmC, 0);
  cc.evex().vfixupimmps(xmmA, xmmB, xmmC, 0);
  cc.evex().vfixupimmps(ymmA, ymmB, ymmC, 0);
  cc.evex().vfixupimmps(zmmA, zmmB, zmmC, 0);
  cc.evex().vfixupimmsd(xmmA, xmmB, xmmC, 0);
  cc.evex().vfixupimmss(xmmA, xmmB, xmmC, 0);
  cc.evex().vfmadd132pd(xmmA, xmmB, xmmC);
  cc.evex().vfmadd132pd(ymmA, ymmB, ymmC);
  cc.evex().vfmadd132pd(zmmA, zmmB, zmmC);
  cc.evex().vfmadd132ps(xmmA, xmmB, xmmC);
  cc.evex().vfmadd132ps(ymmA, ymmB, ymmC);
  cc.evex().vfmadd132ps(zmmA, zmmB, zmmC);
  cc.evex().vfmadd132sd(xmmA, xmmB, xmmC);
  cc.evex().vfmadd132ss(xmmA, xmmB, xmmC);
  cc.evex().vfmadd213pd(xmmA, xmmB, xmmC);
  cc.evex().vfmadd213pd(ymmA, ymmB, ymmC);
  cc.evex().vfmadd213pd(zmmA, zmmB, zmmC);
  cc.evex().vfmadd213ps(xmmA, xmmB, xmmC);
  cc.evex().vfmadd213ps(ymmA, ymmB, ymmC);
  cc.evex().vfmadd213ps(zmmA, zmmB, zmmC);
  cc.evex().vfmadd213sd(xmmA, xmmB, xmmC);
  cc.evex().vfmadd213ss(xmmA, xmmB, xmmC);
  cc.evex().vfmadd231pd(xmmA, xmmB, xmmC);
  cc.evex().vfmadd231pd(ymmA, ymmB, ymmC);
  cc.evex().vfmadd231pd(zmmA, zmmB, zmmC);
  cc.evex().vfmadd231ps(xmmA, xmmB, xmmC);
  cc.evex().vfmadd231ps(ymmA, ymmB, ymmC);
  cc.evex().vfmadd231ps(zmmA, zmmB, zmmC);
  cc.evex().vfmadd231sd(xmmA, xmmB, xmmC);
  cc.evex().vfmadd231ss(xmmA, xmmB, xmmC);
  cc.evex().vfmaddsub132pd(xmmA, xmmB, xmmC);
  cc.evex().vfmaddsub132pd(ymmA, ymmB, ymmC);
  cc.evex().vfmaddsub132pd(zmmA, zmmB, zmmC);
  cc.evex().vfmaddsub132ps(xmmA, xmmB, xmmC);
  cc.evex().vfmaddsub132ps(ymmA, ymmB, ymmC);
  cc.evex().vfmaddsub132ps(zmmA, zmmB, zmmC);
  cc.evex().vfmaddsub213pd(xmmA, xmmB, xmmC);
  cc.evex().vfmaddsub213pd(ymmA, ymmB, ymmC);
  cc.evex().vfmaddsub213pd(zmmA, zmmB, zmmC);
  cc.evex().vfmaddsub213ps(xmmA, xmmB, xmmC);
  cc.evex().vfmaddsub213ps(ymmA, ymmB, ymmC);
  cc.evex().vfmaddsub213ps(zmmA, zmmB, zmmC);
  cc.evex().vfmaddsub231pd(xmmA, xmmB, xmmC);
  cc.evex().vfmaddsub231pd(ymmA, ymmB, ymmC);
  cc.evex().vfmaddsub231pd(zmmA, zmmB, zmmC);
  cc.evex().vfmaddsub231ps(xmmA, xmmB, xmmC);
  cc.evex().vfmaddsub231ps(ymmA, ymmB, ymmC);
  cc.evex().vfmaddsub231ps(zmmA, zmmB, zmmC);
  cc.evex().vfmsub132pd(xmmA, xmmB, xmmC);
  cc.evex().vfmsub132pd(ymmA, ymmB, ymmC);
  cc.evex().vfmsub132pd(zmmA, zmmB, zmmC);
  cc.evex().vfmsub132ps(xmmA, xmmB, xmmC);
  cc.evex().vfmsub132ps(ymmA, ymmB, ymmC);
  cc.evex().vfmsub132ps(zmmA, zmmB, zmmC);
  cc.evex().vfmsub132sd(xmmA, xmmB, xmmC);
  cc.evex().vfmsub132ss(xmmA, xmmB, xmmC);
  cc.evex().vfmsub213pd(xmmA, xmmB, xmmC);
  cc.evex().vfmsub213pd(ymmA, ymmB, ymmC);
  cc.evex().vfmsub213pd(zmmA, zmmB, zmmC);
  cc.evex().vfmsub213ps(xmmA, xmmB, xmmC);
  cc.evex().vfmsub213ps(ymmA, ymmB, ymmC);
  cc.evex().vfmsub213ps(zmmA, zmmB, zmmC);
  cc.evex().vfmsub213sd(xmmA, xmmB, xmmC);
  cc.evex().vfmsub213ss(xmmA, xmmB, xmmC);
  cc.evex().vfmsub231pd(xmmA, xmmB, xmmC);
  cc.evex().vfmsub231pd(ymmA, ymmB, ymmC);
  cc.evex().vfmsub231pd(zmmA, zmmB, zmmC);
  cc.evex().vfmsub231ps(xmmA, xmmB, xmmC);
  cc.evex().vfmsub231ps(ymmA, ymmB, ymmC);
  cc.evex().vfmsub231ps(zmmA, zmmB, zmmC);
  cc.evex().vfmsub231sd(xmmA, xmmB, xmmC);
  cc.evex().vfmsub231ss(xmmA, xmmB, xmmC);
  cc.evex().vfmsubadd132pd(xmmA, xmmB, xmmC);
  cc.evex().vfmsubadd132pd(ymmA, ymmB, ymmC);
  cc.evex().vfmsubadd132pd(zmmA, zmmB, zmmC);
  cc.evex().vfmsubadd132ps(xmmA, xmmB, xmmC);
  cc.evex().vfmsubadd132ps(ymmA, ymmB, ymmC);
  cc.evex().vfmsubadd132ps(zmmA, zmmB, zmmC);
  cc.evex().vfmsubadd213pd(xmmA, xmmB, xmmC);
  cc.evex().vfmsubadd213pd(ymmA, ymmB, ymmC);
  cc.evex().vfmsubadd213pd(zmmA, zmmB, zmmC);
  cc.evex().vfmsubadd213ps(xmmA, xmmB, xmmC);
  cc.evex().vfmsubadd213ps(ymmA, ymmB, ymmC);
  cc.evex().vfmsubadd213ps(zmmA, zmmB, zmmC);
  cc.evex().vfmsubadd231pd(xmmA, xmmB, xmmC);
  cc.evex().vfmsubadd231pd(ymmA, ymmB, ymmC);
  cc.evex().vfmsubadd231pd(zmmA, zmmB, zmmC);
  cc.evex().vfmsubadd231ps(xmmA, xmmB, xmmC);
  cc.evex().vfmsubadd231ps(ymmA, ymmB, ymmC);
  cc.evex().vfmsubadd231ps(zmmA, zmmB, zmmC);
  cc.evex().vfnmadd132pd(xmmA, xmmB, xmmC);
  cc.evex().vfnmadd132pd(ymmA, ymmB, ymmC);
  cc.evex().vfnmadd132pd(zmmA, zmmB, zmmC);
  cc.evex().vfnmadd132ps(xmmA, xmmB, xmmC);
  cc.evex().vfnmadd132ps(ymmA, ymmB, ymmC);
  cc.evex().vfnmadd132ps(zmmA, zmmB, zmmC);
  cc.evex().vfnmadd132sd(xmmA, xmmB, xmmC);
  cc.evex().vfnmadd132ss(xmmA, xmmB, xmmC);
  cc.evex().vfnmadd213pd(xmmA, xmmB, xmmC);
  cc.evex().vfnmadd213pd(ymmA, ymmB, ymmC);
  cc.evex().vfnmadd213pd(zmmA, zmmB, zmmC);
  cc.evex().vfnmadd213ps(xmmA, xmmB, xmmC);
  cc.evex().vfnmadd213ps(ymmA, ymmB, ymmC);
  cc.evex().vfnmadd213ps(zmmA, zmmB, zmmC);
  cc.evex().vfnmadd213sd(xmmA, xmmB, xmmC);
  cc.evex().vfnmadd213ss(xmmA, xmmB, xmmC);
  cc.evex().vfnmadd231pd(xmmA, xmmB, xmmC);
  cc.evex().vfnmadd231pd(ymmA, ymmB, ymmC);
  cc.evex().vfnmadd231pd(zmmA, zmmB, zmmC);
  cc.evex().vfnmadd231ps(xmmA, xmmB, xmmC);
  cc.evex().vfnmadd231ps(ymmA, ymmB, ymmC);
  cc.evex().vfnmadd231ps(zmmA, zmmB, zmmC);
  cc.evex().vfnmadd231sd(xmmA, xmmB, xmmC);
  cc.evex().vfnmadd231ss(xmmA, xmmB, xmmC);
  cc.evex().vfnmsub132pd(xmmA, xmmB, xmmC);
  cc.evex().vfnmsub132pd(ymmA, ymmB, ymmC);
  cc.evex().vfnmsub132pd(zmmA, zmmB, zmmC);
  cc.evex().vfnmsub132ps(xmmA, xmmB, xmmC);
  cc.evex().vfnmsub132ps(ymmA, ymmB, ymmC);
  cc.evex().vfnmsub132ps(zmmA, zmmB, zmmC);
  cc.evex().vfnmsub132sd(xmmA, xmmB, xmmC);
  cc.evex().vfnmsub132ss(xmmA, xmmB, xmmC);
  cc.evex().vfnmsub213pd(xmmA, xmmB, xmmC);
  cc.evex().vfnmsub213pd(ymmA, ymmB, ymmC);
  cc.evex().vfnmsub213pd(zmmA, zmmB, zmmC);
  cc.evex().vfnmsub213ps(xmmA, xmmB, xmmC);
  cc.evex().vfnmsub213ps(ymmA, ymmB, ymmC);
  cc.evex().vfnmsub213ps(zmmA, zmmB, zmmC);
  cc.evex().vfnmsub213sd(xmmA, xmmB, xmmC);
  cc.evex().vfnmsub213ss(xmmA, xmmB, xmmC);
  cc.evex().vfnmsub231pd(xmmA, xmmB, xmmC);
  cc.evex().vfnmsub231pd(ymmA, ymmB, ymmC);
  cc.evex().vfnmsub231pd(zmmA, zmmB, zmmC);
  cc.evex().vfnmsub231ps(xmmA, xmmB, xmmC);
  cc.evex().vfnmsub231ps(ymmA, ymmB, ymmC);
  cc.evex().vfnmsub231ps(zmmA, zmmB, zmmC);
  cc.evex().vfnmsub231sd(xmmA, xmmB, xmmC);
  cc.evex().vfnmsub231ss(xmmA, xmmB, xmmC);
  cc.evex().vfpclasspd(kA, xmmB, 0);
  cc.evex().vfpclasspd(kA, ymmB, 0);
  cc.evex().vfpclasspd(kA, zmmB, 0);
  cc.evex().vfpclassps(kA, xmmB, 0);
  cc.evex().vfpclassps(kA, ymmB, 0);
  cc.evex().vfpclassps(kA, zmmB, 0);
  cc.evex().vfpclasssd(kA, xmmB, 0);
  cc.evex().vfpclassss(kA, xmmB, 0);
  cc.evex().vgetexppd(xmmA, xmmB);
  cc.evex().vgetexppd(ymmA, ymmB);
  cc.evex().vgetexppd(zmmA, zmmB);
  cc.evex().vgetexpps(xmmA, xmmB);
  cc.evex().vgetexpps(ymmA, ymmB);
  cc.evex().vgetexpps(zmmA, zmmB);
  cc.evex().vgetexpsd(xmmA, xmmB, xmmC);
  cc.evex().vgetexpss(xmmA, xmmB, xmmC);
  cc.evex().vgetmantpd(xmmA, xmmB, 0);
  cc.evex().vgetmantpd(ymmA, ymmB, 0);
  cc.evex().vgetmantpd(zmmA, zmmB, 0);
  cc.evex().vgetmantps(xmmA, xmmB, 0);
  cc.evex().vgetmantps(ymmA, ymmB, 0);
  cc.evex().vgetmantps(zmmA, zmmB, 0);
  cc.evex().vgetmantsd(xmmA, xmmB, xmmC, 0);
  cc.evex().vgetmantss(xmmA, xmmB, xmmC, 0);
  cc.evex().vinsertf32x4(ymmA, ymmB, xmmC, 0);
  cc.evex().vinsertf32x4(zmmA, zmmB, xmmC, 0);
  cc.evex().vinsertf32x8(zmmA, zmmB, ymmC, 0);
  cc.evex().vinsertf64x2(ymmA, ymmB, xmmC, 0);
  cc.evex().vinsertf64x2(zmmA, zmmB, xmmC, 0);
  cc.evex().vinsertf64x4(zmmA, zmmB, ymmC, 0);
  cc.evex().vinserti32x4(ymmA, ymmB, xmmC, 0);
  cc.evex().vinserti32x4(zmmA, zmmB, xmmC, 0);
  cc.evex().vinserti32x8(zmmA, zmmB, ymmC, 0);
  cc.evex().vinserti64x2(ymmA, ymmB, xmmC, 0);
  cc.evex().vinserti64x2(zmmA, zmmB, xmmC, 0);
  cc.evex().vinserti64x4(zmmA, zmmB, ymmC, 0);
  cc.evex().vinsertps(xmmA, xmmB, xmmC, 0);
  cc.evex().vmaxpd(xmmA, xmmB, xmmC);
  cc.evex().vmaxpd(ymmA, ymmB, ymmC);
  cc.evex().vmaxpd(zmmA, zmmB, zmmC);
  cc.evex().vmaxps(xmmA, xmmB, xmmC);
  cc.evex().vmaxps(ymmA, ymmB, ymmC);
  cc.evex().vmaxps(zmmA, zmmB, zmmC);
  cc.evex().vmaxsd(xmmA, xmmB, xmmC);
  cc.evex().vmaxss(xmmA, xmmB, xmmC);
  cc.evex().vminpd(xmmA, xmmB, xmmC);
  cc.evex().vminpd(ymmA, ymmB, ymmC);
  cc.evex().vminpd(zmmA, zmmB, zmmC);
  cc.evex().vminps(xmmA, xmmB, xmmC);
  cc.evex().vminps(ymmA, ymmB, ymmC);
  cc.evex().vminps(zmmA, zmmB, zmmC);
  cc.evex().vminsd(xmmA, xmmB, xmmC);
  cc.evex().vminss(xmmA, xmmB, xmmC);
  cc.evex().vmovapd(xmmA, xmmB);
  cc.evex().vmovapd(xmmA, xmmB);
  cc.evex().vmovapd(ymmA, ymmB);
  cc.evex().vmovapd(ymmA, ymmB);
  cc.evex().vmovapd(zmmA, zmmB);
  cc.evex().vmovapd(zmmA, zmmB);
  cc.evex().vmovaps(xmmA, xmmB);
  cc.evex().vmovaps(xmmA, xmmB);
  cc.evex().vmovaps(ymmA, ymmB);
  cc.evex().vmovaps(ymmA, ymmB);
  cc.evex().vmovaps(zmmA, zmmB);
  cc.evex().vmovaps(zmmA, zmmB);
  cc.evex().vmovd(gpd, xmmB);
  cc.evex().vmovd(xmmA, gpd);
  cc.evex().vmovddup(xmmA, xmmB);
  cc.evex().vmovddup(ymmA, ymmB);
  cc.evex().vmovddup(zmmA, zmmB);
  cc.evex().vmovdqa32(xmmA, xmmB);
  cc.evex().vmovdqa32(xmmA, xmmB);
  cc.evex().vmovdqa32(ymmA, ymmB);
  cc.evex().vmovdqa32(ymmA, ymmB);
  cc.evex().vmovdqa32(zmmA, zmmB);
  cc.evex().vmovdqa32(zmmA, zmmB);
  cc.evex().vmovdqa64(xmmA, xmmB);
  cc.evex().vmovdqa64(xmmA, xmmB);
  cc.evex().vmovdqa64(ymmA, ymmB);
  cc.evex().vmovdqa64(ymmA, ymmB);
  cc.evex().vmovdqa64(zmmA, zmmB);
  cc.evex().vmovdqa64(zmmA, zmmB);
  cc.evex().vmovdqu16(xmmA, xmmB);
  cc.evex().vmovdqu16(xmmA, xmmB);
  cc.evex().vmovdqu16(ymmA, ymmB);
  cc.evex().vmovdqu16(ymmA, ymmB);
  cc.evex().vmovdqu16(zmmA, zmmB);
  cc.evex().vmovdqu16(zmmA, zmmB);
  cc.evex().vmovdqu32(xmmA, xmmB);
  cc.evex().vmovdqu32(xmmA, xmmB);
  cc.evex().vmovdqu32(ymmA, ymmB);
  cc.evex().vmovdqu32(ymmA, ymmB);
  cc.evex().vmovdqu32(zmmA, zmmB);
  cc.evex().vmovdqu32(zmmA, zmmB);
  cc.evex().vmovdqu64(xmmA, xmmB);
  cc.evex().vmovdqu64(xmmA, xmmB);
  cc.evex().vmovdqu64(ymmA, ymmB);
  cc.evex().vmovdqu64(ymmA, ymmB);
  cc.evex().vmovdqu64(zmmA, zmmB);
  cc.evex().vmovdqu64(zmmA, zmmB);
  cc.evex().vmovdqu8(xmmA, xmmB);
  cc.evex().vmovdqu8(xmmA, xmmB);
  cc.evex().vmovdqu8(ymmA, ymmB);
  cc.evex().vmovdqu8(ymmA, ymmB);
  cc.evex().vmovdqu8(zmmA, zmmB);
  cc.evex().vmovdqu8(zmmA, zmmB);
  cc.evex().vmovhlps(xmmA, xmmB, xmmC);
  if (cc.is64Bit()) cc.evex().vmovq(gpq, xmmB);
  if (cc.is64Bit()) cc.evex().vmovq(xmmA, gpq);
  cc.evex().vmovq(xmmA, xmmB);
  cc.evex().vmovsd(xmmA, xmmB, xmmC);
  cc.evex().vmovshdup(xmmA, xmmB);
  cc.evex().vmovshdup(ymmA, ymmB);
  cc.evex().vmovshdup(zmmA, zmmB);
  cc.evex().vmovsldup(xmmA, xmmB);
  cc.evex().vmovsldup(ymmA, ymmB);
  cc.evex().vmovsldup(zmmA, zmmB);
  cc.evex().vmovss(xmmA, xmmB, xmmC);
  cc.evex().vmovupd(xmmA, xmmB);
  cc.evex().vmovupd(xmmA, xmmB);
  cc.evex().vmovupd(ymmA, ymmB);
  cc.evex().vmovupd(ymmA, ymmB);
  cc.evex().vmovupd(zmmA, zmmB);
  cc.evex().vmovupd(zmmA, zmmB);
  cc.evex().vmovups(xmmA, xmmB);
  cc.evex().vmovups(xmmA, xmmB);
  cc.evex().vmovups(ymmA, ymmB);
  cc.evex().vmovups(ymmA, ymmB);
  cc.evex().vmovups(zmmA, zmmB);
  cc.evex().vmovups(zmmA, zmmB);
  cc.evex().vmulpd(xmmA, xmmB, xmmC);
  cc.evex().vmulpd(ymmA, ymmB, ymmC);
  cc.evex().vmulpd(zmmA, zmmB, zmmC);
  cc.evex().vmulps(xmmA, xmmB, xmmC);
  cc.evex().vmulps(ymmA, ymmB, ymmC);
  cc.evex().vmulps(zmmA, zmmB, zmmC);
  cc.evex().vmulsd(xmmA, xmmB, xmmC);
  cc.evex().vmulss(xmmA, xmmB, xmmC);
  cc.evex().vorpd(xmmA, xmmB, xmmC);
  cc.evex().vorpd(ymmA, ymmB, ymmC);
  cc.evex().vorpd(zmmA, zmmB, zmmC);
  cc.evex().vorps(xmmA, xmmB, xmmC);
  cc.evex().vorps(ymmA, ymmB, ymmC);
  cc.evex().vorps(zmmA, zmmB, zmmC);
  cc.evex().vpabsb(xmmA, xmmB);
  cc.evex().vpabsb(ymmA, ymmB);
  cc.evex().vpabsb(zmmA, zmmB);
  cc.evex().vpabsd(xmmA, xmmB);
  cc.evex().vpabsd(ymmA, ymmB);
  cc.evex().vpabsd(zmmA, zmmB);
  cc.evex().vpabsq(xmmA, xmmB);
  cc.evex().vpabsq(ymmA, ymmB);
  cc.evex().vpabsq(zmmA, zmmB);
  cc.evex().vpabsw(xmmA, xmmB);
  cc.evex().vpabsw(ymmA, ymmB);
  cc.evex().vpabsw(zmmA, zmmB);
  cc.evex().vpackssdw(xmmA, xmmB, xmmC);
  cc.evex().vpackssdw(ymmA, ymmB, ymmC);
  cc.evex().vpackssdw(zmmA, zmmB, zmmC);
  cc.evex().vpacksswb(xmmA, xmmB, xmmC);
  cc.evex().vpacksswb(ymmA, ymmB, ymmC);
  cc.evex().vpacksswb(zmmA, zmmB, zmmC);
  cc.evex().vpackusdw(xmmA, xmmB, xmmC);
  cc.evex().vpackusdw(ymmA, ymmB, ymmC);
  cc.evex().vpackusdw(zmmA, zmmB, zmmC);
  cc.evex().vpackuswb(xmmA, xmmB, xmmC);
  cc.evex().vpackuswb(ymmA, ymmB, ymmC);
  cc.evex().vpackuswb(zmmA, zmmB, zmmC);
  cc.evex().vpaddb(xmmA, xmmB, xmmC);
  cc.evex().vpaddb(ymmA, ymmB, ymmC);
  cc.evex().vpaddb(zmmA, zmmB, zmmC);
  cc.evex().vpaddd(xmmA, xmmB, xmmC);
  cc.evex().vpaddd(ymmA, ymmB, ymmC);
  cc.evex().vpaddd(zmmA, zmmB, zmmC);
  cc.evex().vpaddq(xmmA, xmmB, xmmC);
  cc.evex().vpaddq(ymmA, ymmB, ymmC);
  cc.evex().vpaddq(zmmA, zmmB, zmmC);
  cc.evex().vpaddsb(xmmA, xmmB, xmmC);
  cc.evex().vpaddsb(ymmA, ymmB, ymmC);
  cc.evex().vpaddsb(zmmA, zmmB, zmmC);
  cc.evex().vpaddsw(xmmA, xmmB, xmmC);
  cc.evex().vpaddsw(ymmA, ymmB, ymmC);
  cc.evex().vpaddsw(zmmA, zmmB, zmmC);
  cc.evex().vpaddusb(xmmA, xmmB, xmmC);
  cc.evex().vpaddusb(ymmA, ymmB, ymmC);
  cc.evex().vpaddusb(zmmA, zmmB, zmmC);
  cc.evex().vpaddusw(xmmA, xmmB, xmmC);
  cc.evex().vpaddusw(ymmA, ymmB, ymmC);
  cc.evex().vpaddusw(zmmA, zmmB, zmmC);
  cc.evex().vpaddw(xmmA, xmmB, xmmC);
  cc.evex().vpaddw(ymmA, ymmB, ymmC);
  cc.evex().vpaddw(zmmA, zmmB, zmmC);
  cc.evex().vpalignr(xmmA, xmmB, xmmC, 0);
  cc.evex().vpalignr(ymmA, ymmB, ymmC, 0);
  cc.evex().vpalignr(zmmA, zmmB, zmmC, 0);
  cc.evex().vpandd(xmmA, xmmB, xmmC);
  cc.evex().vpandd(ymmA, ymmB, ymmC);
  cc.evex().vpandd(zmmA, zmmB, zmmC);
  cc.evex().vpandnd(xmmA, xmmB, xmmC);
  cc.evex().vpandnd(ymmA, ymmB, ymmC);
  cc.evex().vpandnd(zmmA, zmmB, zmmC);
  cc.evex().vpandnq(xmmA, xmmB, xmmC);
  cc.evex().vpandnq(ymmA, ymmB, ymmC);
  cc.evex().vpandnq(zmmA, zmmB, zmmC);
  cc.evex().vpandq(xmmA, xmmB, xmmC);
  cc.evex().vpandq(ymmA, ymmB, ymmC);
  cc.evex().vpandq(zmmA, zmmB, zmmC);
  cc.evex().vpavgb(xmmA, xmmB, xmmC);
  cc.evex().vpavgb(ymmA, ymmB, ymmC);
  cc.evex().vpavgb(zmmA, zmmB, zmmC);
  cc.evex().vpavgw(xmmA, xmmB, xmmC);
  cc.evex().vpavgw(ymmA, ymmB, ymmC);
  cc.evex().vpavgw(zmmA, zmmB, zmmC);
  cc.evex().vpblendmb(xmmA, xmmB, xmmC);
  cc.evex().vpblendmb(ymmA, ymmB, ymmC);
  cc.evex().vpblendmb(zmmA, zmmB, zmmC);
  cc.evex().vpblendmd(xmmA, xmmB, xmmC);
  cc.evex().vpblendmd(ymmA, ymmB, ymmC);
  cc.evex().vpblendmd(zmmA, zmmB, zmmC);
  cc.evex().vpblendmq(xmmA, xmmB, xmmC);
  cc.evex().vpblendmq(ymmA, ymmB, ymmC);
  cc.evex().vpblendmq(zmmA, zmmB, zmmC);
  cc.evex().vpblendmw(xmmA, xmmB, xmmC);
  cc.evex().vpblendmw(ymmA, ymmB, ymmC);
  cc.evex().vpblendmw(zmmA, zmmB, zmmC);
  cc.evex().vpbroadcastb(xmmA, gpd);
  cc.evex().vpbroadcastb(xmmA, xmmB);
  cc.evex().vpbroadcastb(ymmA, gpd);
  cc.evex().vpbroadcastb(ymmA, xmmB);
  cc.evex().vpbroadcastb(zmmA, gpd);
  cc.evex().vpbroadcastb(zmmA, xmmB);
  cc.evex().vpbroadcastd(xmmA, gpd);
  cc.evex().vpbroadcastd(xmmA, xmmB);
  cc.evex().vpbroadcastd(ymmA, gpd);
  cc.evex().vpbroadcastd(ymmA, xmmB);
  cc.evex().vpbroadcastd(zmmA, gpd);
  cc.evex().vpbroadcastd(zmmA, xmmB);
  cc.evex().vpbroadcastmb2q(xmmA, kB);
  cc.evex().vpbroadcastmb2q(ymmA, kB);
  cc.evex().vpbroadcastmb2q(zmmA, kB);
  cc.evex().vpbroadcastmw2d(xmmA, kB);
  cc.evex().vpbroadcastmw2d(ymmA, kB);
  cc.evex().vpbroadcastmw2d(zmmA, kB);
  if (cc.is64Bit()) cc.evex().vpbroadcastq(xmmA, gpq);
  cc.evex().vpbroadcastq(xmmA, xmmB);
  if (cc.is64Bit()) cc.evex().vpbroadcastq(ymmA, gpq);
  cc.evex().vpbroadcastq(ymmA, xmmB);
  if (cc.is64Bit()) cc.evex().vpbroadcastq(zmmA, gpq);
  cc.evex().vpbroadcastq(zmmA, xmmB);
  cc.evex().vpbroadcastw(xmmA, gpd);
  cc.evex().vpbroadcastw(xmmA, xmmB);
  cc.evex().vpbroadcastw(ymmA, gpd);
  cc.evex().vpbroadcastw(ymmA, xmmB);
  cc.evex().vpbroadcastw(zmmA, gpd);
  cc.evex().vpbroadcastw(zmmA, xmmB);
  cc.evex().vpcmpb(kA, xmmB, xmmC, 0);
  cc.evex().vpcmpb(kA, ymmB, ymmC, 0);
  cc.evex().vpcmpb(kA, zmmB, zmmC, 0);
  cc.evex().vpcmpd(kA, xmmB, xmmC, 0);
  cc.evex().vpcmpd(kA, ymmB, ymmC, 0);
  cc.evex().vpcmpd(kA, zmmB, zmmC, 0);
  cc.evex().vpcmpeqb(kA, xmmB, xmmC);
  cc.evex().vpcmpeqb(kA, ymmB, ymmC);
  cc.evex().vpcmpeqb(kA, zmmB, zmmC);
  cc.evex().vpcmpeqd(kA, xmmB, xmmC);
  cc.evex().vpcmpeqd(kA, ymmB, ymmC);
  cc.evex().vpcmpeqd(kA, zmmB, zmmC);
  cc.evex().vpcmpeqq(kA, xmmB, xmmC);
  cc.evex().vpcmpeqq(kA, ymmB, ymmC);
  cc.evex().vpcmpeqq(kA, zmmB, zmmC);
  cc.evex().vpcmpeqw(kA, xmmB, xmmC);
  cc.evex().vpcmpeqw(kA, ymmB, ymmC);
  cc.evex().vpcmpeqw(kA, zmmB, zmmC);
  cc.evex().vpcmpgtb(kA, xmmB, xmmC);
  cc.evex().vpcmpgtb(kA, ymmB, ymmC);
  cc.evex().vpcmpgtb(kA, zmmB, zmmC);
  cc.evex().vpcmpgtd(kA, xmmB, xmmC);
  cc.evex().vpcmpgtd(kA, ymmB, ymmC);
  cc.evex().vpcmpgtd(kA, zmmB, zmmC);
  cc.evex().vpcmpgtq(kA, xmmB, xmmC);
  cc.evex().vpcmpgtq(kA, ymmB, ymmC);
  cc.evex().vpcmpgtq(kA, zmmB, zmmC);
  cc.evex().vpcmpgtw(kA, xmmB, xmmC);
  cc.evex().vpcmpgtw(kA, ymmB, ymmC);
  cc.evex().vpcmpgtw(kA, zmmB, zmmC);
  cc.evex().vpcmpq(kA, xmmB, xmmC, 0);
  cc.evex().vpcmpq(kA, ymmB, ymmC, 0);
  cc.evex().vpcmpq(kA, zmmB, zmmC, 0);
  cc.evex().vpcmpub(kA, xmmB, xmmC, 0);
  cc.evex().vpcmpub(kA, ymmB, ymmC, 0);
  cc.evex().vpcmpub(kA, zmmB, zmmC, 0);
  cc.evex().vpcmpud(kA, xmmB, xmmC, 0);
  cc.evex().vpcmpud(kA, ymmB, ymmC, 0);
  cc.evex().vpcmpud(kA, zmmB, zmmC, 0);
  cc.evex().vpcmpuq(kA, xmmB, xmmC, 0);
  cc.evex().vpcmpuq(kA, ymmB, ymmC, 0);
  cc.evex().vpcmpuq(kA, zmmB, zmmC, 0);
  cc.evex().vpcmpuw(kA, xmmB, xmmC, 0);
  cc.evex().vpcmpuw(kA, ymmB, ymmC, 0);
  cc.evex().vpcmpuw(kA, zmmB, zmmC, 0);
  cc.evex().vpcmpw(kA, xmmB, xmmC, 0);
  cc.evex().vpcmpw(kA, ymmB, ymmC, 0);
  cc.evex().vpcmpw(kA, zmmB, zmmC, 0);
  cc.evex().vpcompressd(xmmA, xmmB);
  cc.evex().vpcompressd(ymmA, ymmB);
  cc.evex().vpcompressd(zmmA, zmmB);
  cc.evex().vpcompressq(xmmA, xmmB);
  cc.evex().vpcompressq(ymmA, ymmB);
  cc.evex().vpcompressq(zmmA, zmmB);
  cc.evex().vpconflictd(xmmA, xmmB);
  cc.evex().vpconflictd(ymmA, ymmB);
  cc.evex().vpconflictd(zmmA, zmmB);
  cc.evex().vpconflictq(xmmA, xmmB);
  cc.evex().vpconflictq(ymmA, ymmB);
  cc.evex().vpconflictq(zmmA, zmmB);
  cc.evex().vpermb(xmmA, xmmB, xmmC);
  cc.evex().vpermb(ymmA, ymmB, ymmC);
  cc.evex().vpermb(zmmA, zmmB, zmmC);
  cc.evex().vpermd(ymmA, ymmB, ymmC);
  cc.evex().vpermd(zmmA, zmmB, zmmC);
  cc.evex().vpermi2b(xmmA, xmmB, xmmC);
  cc.evex().vpermi2b(ymmA, ymmB, ymmC);
  cc.evex().vpermi2b(zmmA, zmmB, zmmC);
  cc.evex().vpermi2d(xmmA, xmmB, xmmC);
  cc.evex().vpermi2d(ymmA, ymmB, ymmC);
  cc.evex().vpermi2d(zmmA, zmmB, zmmC);
  cc.evex().vpermi2pd(xmmA, xmmB, xmmC);
  cc.evex().vpermi2pd(ymmA, ymmB, ymmC);
  cc.evex().vpermi2pd(zmmA, zmmB, zmmC);
  cc.evex().vpermi2ps(xmmA, xmmB, xmmC);
  cc.evex().vpermi2ps(ymmA, ymmB, ymmC);
  cc.evex().vpermi2ps(zmmA, zmmB, zmmC);
  cc.evex().vpermi2q(xmmA, xmmB, xmmC);
  cc.evex().vpermi2q(ymmA, ymmB, ymmC);
  cc.evex().vpermi2q(zmmA, zmmB, zmmC);
  cc.evex().vpermi2w(xmmA, xmmB, xmmC);
  cc.evex().vpermi2w(ymmA, ymmB, ymmC);
  cc.evex().vpermi2w(zmmA, zmmB, zmmC);
  cc.evex().vpermilpd(xmmA, xmmB, xmmC);
  cc.evex().vpermilpd(ymmA, ymmB, ymmC);
  cc.evex().vpermilpd(zmmA, zmmB, zmmC);
  cc.evex().vpermilpd(xmmA, xmmB, 0);
  cc.evex().vpermilpd(ymmA, ymmB, 0);
  cc.evex().vpermilpd(zmmA, zmmB, 0);
  cc.evex().vpermilps(xmmA, xmmB, xmmC);
  cc.evex().vpermilps(ymmA, ymmB, ymmC);
  cc.evex().vpermilps(zmmA, zmmB, zmmC);
  cc.evex().vpermilps(xmmA, xmmB, 0);
  cc.evex().vpermilps(ymmA, ymmB, 0);
  cc.evex().vpermilps(zmmA, zmmB, 0);
  cc.evex().vpermq(ymmA, ymmB, ymmC);
  cc.evex().vpermq(zmmA, zmmB, zmmC);
  cc.evex().vpermq(ymmA, ymmB, 0);
  cc.evex().vpermq(zmmA, zmmB, 0);
  cc.evex().vpermt2b(xmmA, xmmB, xmmC);
  cc.evex().vpermt2b(ymmA, ymmB, ymmC);
  cc.evex().vpermt2b(zmmA, zmmB, zmmC);
  cc.evex().vpermt2d(xmmA, xmmB, xmmC);
  cc.evex().vpermt2d(ymmA, ymmB, ymmC);
  cc.evex().vpermt2d(zmmA, zmmB, zmmC);
  cc.evex().vpermt2pd(xmmA, xmmB, xmmC);
  cc.evex().vpermt2pd(ymmA, ymmB, ymmC);
  cc.evex().vpermt2pd(zmmA, zmmB, zmmC);
  cc.evex().vpermt2ps(xmmA, xmmB, xmmC);
  cc.evex().vpermt2ps(ymmA, ymmB, ymmC);
  cc.evex().vpermt2ps(zmmA, zmmB, zmmC);
  cc.evex().vpermt2q(xmmA, xmmB, xmmC);
  cc.evex().vpermt2q(ymmA, ymmB, ymmC);
  cc.evex().vpermt2q(zmmA, zmmB, zmmC);
  cc.evex().vpermt2w(xmmA, xmmB, xmmC);
  cc.evex().vpermt2w(ymmA, ymmB, ymmC);
  cc.evex().vpermt2w(zmmA, zmmB, zmmC);
  cc.evex().vpermw(xmmA, xmmB, xmmC);
  cc.evex().vpermw(ymmA, ymmB, ymmC);
  cc.evex().vpermw(zmmA, zmmB, zmmC);
  cc.evex().vpexpandd(xmmA, xmmB);
  cc.evex().vpexpandd(ymmA, ymmB);
  cc.evex().vpexpandd(zmmA, zmmB);
  cc.evex().vpexpandq(xmmA, xmmB);
  cc.evex().vpexpandq(ymmA, ymmB);
  cc.evex().vpexpandq(zmmA, zmmB);
  cc.evex().vpextrb(gpd, xmmB, 0);
  cc.evex().vpextrd(gpd, xmmB, 0);
  if (cc.is64Bit()) cc.evex().vpextrq(gpq, xmmB, 0);
  cc.evex().vpextrw(gpd, xmmB, 0);
  cc.evex().vpinsrb(xmmA, xmmB, gpd, 0);
  cc.evex().vpinsrd(xmmA, xmmB, gpd, 0);
  if (cc.is64Bit()) cc.evex().vpinsrq(xmmA, xmmB, gpq, 0);
  cc.evex().vpinsrw(xmmA, xmmB, gpd, 0);
  cc.evex().vplzcntd(xmmA, xmmB);
  cc.evex().vplzcntd(ymmA, ymmB);
  cc.evex().vplzcntd(zmmA, zmmB);
  cc.evex().vplzcntq(xmmA, xmmB);
  cc.evex().vplzcntq(ymmA, ymmB);
  cc.evex().vplzcntq(zmmA, zmmB);
  cc.evex().vpmadd52huq(xmmA, xmmB, xmmC);
  cc.evex().vpmadd52huq(ymmA, ymmB, ymmC);
  cc.evex().vpmadd52huq(zmmA, zmmB, zmmC);
  cc.evex().vpmadd52luq(xmmA, xmmB, xmmC);
  cc.evex().vpmadd52luq(ymmA, ymmB, ymmC);
  cc.evex().vpmadd52luq(zmmA, zmmB, zmmC);
  cc.evex().vpmaddubsw(xmmA, xmmB, xmmC);
  cc.evex().vpmaddubsw(ymmA, ymmB, ymmC);
  cc.evex().vpmaddubsw(zmmA, zmmB, zmmC);
  cc.evex().vpmaddwd(xmmA, xmmB, xmmC);
  cc.evex().vpmaddwd(ymmA, ymmB, ymmC);
  cc.evex().vpmaddwd(zmmA, zmmB, zmmC);
  cc.evex().vpmaxsb(xmmA, xmmB, xmmC);
  cc.evex().vpmaxsb(ymmA, ymmB, ymmC);
  cc.evex().vpmaxsb(zmmA, zmmB, zmmC);
  cc.evex().vpmaxsd(xmmA, xmmB, xmmC);
  cc.evex().vpmaxsd(ymmA, ymmB, ymmC);
  cc.evex().vpmaxsd(zmmA, zmmB, zmmC);
  cc.evex().vpmaxsq(xmmA, xmmB, xmmC);
  cc.evex().vpmaxsq(ymmA, ymmB, ymmC);
  cc.evex().vpmaxsq(zmmA, zmmB, zmmC);
  cc.evex().vpmaxsw(xmmA, xmmB, xmmC);
  cc.evex().vpmaxsw(ymmA, ymmB, ymmC);
  cc.evex().vpmaxsw(zmmA, zmmB, zmmC);
  cc.evex().vpmaxub(xmmA, xmmB, xmmC);
  cc.evex().vpmaxub(ymmA, ymmB, ymmC);
  cc.evex().vpmaxub(zmmA, zmmB, zmmC);
  cc.evex().vpmaxud(xmmA, xmmB, xmmC);
  cc.evex().vpmaxud(ymmA, ymmB, ymmC);
  cc.evex().vpmaxud(zmmA, zmmB, zmmC);
  cc.evex().vpmaxuq(xmmA, xmmB, xmmC);
  cc.evex().vpmaxuq(ymmA, ymmB, ymmC);
  cc.evex().vpmaxuq(zmmA, zmmB, zmmC);
  cc.evex().vpmaxuw(xmmA, xmmB, xmmC);
  cc.evex().vpmaxuw(ymmA, ymmB, ymmC);
  cc.evex().vpmaxuw(zmmA, zmmB, zmmC);
  cc.evex().vpminsb(xmmA, xmmB, xmmC);
  cc.evex().vpminsb(ymmA, ymmB, ymmC);
  cc.evex().vpminsb(zmmA, zmmB, zmmC);
  cc.evex().vpminsd(xmmA, xmmB, xmmC);
  cc.evex().vpminsd(ymmA, ymmB, ymmC);
  cc.evex().vpminsd(zmmA, zmmB, zmmC);
  cc.evex().vpminsq(xmmA, xmmB, xmmC);
  cc.evex().vpminsq(ymmA, ymmB, ymmC);
  cc.evex().vpminsq(zmmA, zmmB, zmmC);
  cc.evex().vpminsw(xmmA, xmmB, xmmC);
  cc.evex().vpminsw(ymmA, ymmB, ymmC);
  cc.evex().vpminsw(zmmA, zmmB, zmmC);
  cc.evex().vpminub(xmmA, xmmB, xmmC);
  cc.evex().vpminub(ymmA, ymmB, ymmC);
  cc.evex().vpminub(zmmA, zmmB, zmmC);
  cc.evex().vpminud(xmmA, xmmB, xmmC);
  cc.evex().vpminud(ymmA, ymmB, ymmC);
  cc.evex().vpminud(zmmA, zmmB, zmmC);
  cc.evex().vpminuq(xmmA, xmmB, xmmC);
  cc.evex().vpminuq(ymmA, ymmB, ymmC);
  cc.evex().vpminuq(zmmA, zmmB, zmmC);
  cc.evex().vpminuw(xmmA, xmmB, xmmC);
  cc.evex().vpminuw(ymmA, ymmB, ymmC);
  cc.evex().vpminuw(zmmA, zmmB, zmmC);
  cc.evex().vpmovb2m(kA, xmmB);
  cc.evex().vpmovb2m(kA, ymmB);
  cc.evex().vpmovb2m(kA, zmmB);
  cc.evex().vpmovd2m(kA, xmmB);
  cc.evex().vpmovd2m(kA, ymmB);
  cc.evex().vpmovd2m(kA, zmmB);
  cc.evex().vpmovdb(xmmA, xmmB);
  cc.evex().vpmovdb(xmmA, ymmB);
  cc.evex().vpmovdb(xmmA, zmmB);
  cc.evex().vpmovdw(xmmA, xmmB);
  cc.evex().vpmovdw(xmmA, ymmB);
  cc.evex().vpmovdw(ymmA, zmmB);
  cc.evex().vpmovm2b(xmmA, kB);
  cc.evex().vpmovm2b(ymmA, kB);
  cc.evex().vpmovm2b(zmmA, kB);
  cc.evex().vpmovm2d(xmmA, kB);
  cc.evex().vpmovm2d(ymmA, kB);
  cc.evex().vpmovm2d(zmmA, kB);
  cc.evex().vpmovm2q(xmmA, kB);
  cc.evex().vpmovm2q(ymmA, kB);
  cc.evex().vpmovm2q(zmmA, kB);
  cc.evex().vpmovm2w(xmmA, kB);
  cc.evex().vpmovm2w(ymmA, kB);
  cc.evex().vpmovm2w(zmmA, kB);
  cc.evex().vpmovq2m(kA, xmmB);
  cc.evex().vpmovq2m(kA, ymmB);
  cc.evex().vpmovq2m(kA, zmmB);
  cc.evex().vpmovqb(xmmA, xmmB);
  cc.evex().vpmovqb(xmmA, ymmB);
  cc.evex().vpmovqb(xmmA, zmmB);
  cc.evex().vpmovqd(xmmA, xmmB);
  cc.evex().vpmovqd(xmmA, ymmB);
  cc.evex().vpmovqd(ymmA, zmmB);
  cc.evex().vpmovqw(xmmA, xmmB);
  cc.evex().vpmovqw(xmmA, ymmB);
  cc.evex().vpmovqw(xmmA, zmmB);
  cc.evex().vpmovsdb(xmmA, xmmB);
  cc.evex().vpmovsdb(xmmA, ymmB);
  cc.evex().vpmovsdb(xmmA, zmmB);
  cc.evex().vpmovsdw(xmmA, xmmB);
  cc.evex().vpmovsdw(xmmA, ymmB);
  cc.evex().vpmovsdw(ymmA, zmmB);
  cc.evex().vpmovsqb(xmmA, xmmB);
  cc.evex().vpmovsqb(xmmA, ymmB);
  cc.evex().vpmovsqb(xmmA, zmmB);
  cc.evex().vpmovsqd(xmmA, xmmB);
  cc.evex().vpmovsqd(xmmA, ymmB);
  cc.evex().vpmovsqd(ymmA, zmmB);
  cc.evex().vpmovsqw(xmmA, xmmB);
  cc.evex().vpmovsqw(xmmA, ymmB);
  cc.evex().vpmovsqw(xmmA, zmmB);
  cc.evex().vpmovswb(xmmA, xmmB);
  cc.evex().vpmovswb(xmmA, ymmB);
  cc.evex().vpmovswb(ymmA, zmmB);
  cc.evex().vpmovsxbd(xmmA, xmmB);
  cc.evex().vpmovsxbd(ymmA, xmmB);
  cc.evex().vpmovsxbd(zmmA, xmmB);
  cc.evex().vpmovsxbq(xmmA, xmmB);
  cc.evex().vpmovsxbq(ymmA, xmmB);
  cc.evex().vpmovsxbq(zmmA, xmmB);
  cc.evex().vpmovsxbw(xmmA, xmmB);
  cc.evex().vpmovsxbw(ymmA, xmmB);
  cc.evex().vpmovsxbw(zmmA, ymmB);
  cc.evex().vpmovsxdq(xmmA, xmmB);
  cc.evex().vpmovsxdq(ymmA, xmmB);
  cc.evex().vpmovsxdq(zmmA, ymmB);
  cc.evex().vpmovsxwd(xmmA, xmmB);
  cc.evex().vpmovsxwd(ymmA, xmmB);
  cc.evex().vpmovsxwd(zmmA, ymmB);
  cc.evex().vpmovsxwq(xmmA, xmmB);
  cc.evex().vpmovsxwq(ymmA, xmmB);
  cc.evex().vpmovsxwq(zmmA, xmmB);
  cc.evex().vpmovusdb(xmmA, xmmB);
  cc.evex().vpmovusdb(xmmA, ymmB);
  cc.evex().vpmovusdb(xmmA, zmmB);
  cc.evex().vpmovusdw(xmmA, xmmB);
  cc.evex().vpmovusdw(xmmA, ymmB);
  cc.evex().vpmovusdw(ymmA, zmmB);
  cc.evex().vpmovusqb(xmmA, xmmB);
  cc.evex().vpmovusqb(xmmA, ymmB);
  cc.evex().vpmovusqb(xmmA, zmmB);
  cc.evex().vpmovusqd(xmmA, xmmB);
  cc.evex().vpmovusqd(xmmA, ymmB);
  cc.evex().vpmovusqd(ymmA, zmmB);
  cc.evex().vpmovusqw(xmmA, xmmB);
  cc.evex().vpmovusqw(xmmA, ymmB);
  cc.evex().vpmovusqw(xmmA, zmmB);
  cc.evex().vpmovuswb(xmmA, xmmB);
  cc.evex().vpmovuswb(xmmA, ymmB);
  cc.evex().vpmovuswb(ymmA, zmmB);
  cc.evex().vpmovw2m(kA, xmmB);
  cc.evex().vpmovw2m(kA, ymmB);
  cc.evex().vpmovw2m(kA, zmmB);
  cc.evex().vpmovwb(xmmA, xmmB);
  cc.evex().vpmovwb(xmmA, ymmB);
  cc.evex().vpmovwb(ymmA, zmmB);
  cc.evex().vpmovzxbd(xmmA, xmmB);
  cc.evex().vpmovzxbd(ymmA, xmmB);
  cc.evex().vpmovzxbd(zmmA, xmmB);
  cc.evex().vpmovzxbq(xmmA, xmmB);
  cc.evex().vpmovzxbq(ymmA, xmmB);
  cc.evex().vpmovzxbq(zmmA, xmmB);
  cc.evex().vpmovzxbw(xmmA, xmmB);
  cc.evex().vpmovzxbw(ymmA, xmmB);
  cc.evex().vpmovzxbw(zmmA, ymmB);
  cc.evex().vpmovzxdq(xmmA, xmmB);
  cc.evex().vpmovzxdq(ymmA, xmmB);
  cc.evex().vpmovzxdq(zmmA, ymmB);
  cc.evex().vpmovzxwd(xmmA, xmmB);
  cc.evex().vpmovzxwd(ymmA, xmmB);
  cc.evex().vpmovzxwd(zmmA, ymmB);
  cc.evex().vpmovzxwq(xmmA, xmmB);
  cc.evex().vpmovzxwq(ymmA, xmmB);
  cc.evex().vpmovzxwq(zmmA, xmmB);
  cc.evex().vpmuldq(xmmA, xmmB, xmmC);
  cc.evex().vpmuldq(ymmA, ymmB, ymmC);
  cc.evex().vpmuldq(zmmA, zmmB, zmmC);
  cc.evex().vpmulhrsw(xmmA, xmmB, xmmC);
  cc.evex().vpmulhrsw(ymmA, ymmB, ymmC);
  cc.evex().vpmulhrsw(zmmA, zmmB, zmmC);
  cc.evex().vpmulhuw(xmmA, xmmB, xmmC);
  cc.evex().vpmulhuw(ymmA, ymmB, ymmC);
  cc.evex().vpmulhuw(zmmA, zmmB, zmmC);
  cc.evex().vpmulhw(xmmA, xmmB, xmmC);
  cc.evex().vpmulhw(ymmA, ymmB, ymmC);
  cc.evex().vpmulhw(zmmA, zmmB, zmmC);
  cc.evex().vpmulld(xmmA, xmmB, xmmC);
  cc.evex().vpmulld(ymmA, ymmB, ymmC);
  cc.evex().vpmulld(zmmA, zmmB, zmmC);
  cc.evex().vpmullq(xmmA, xmmB, xmmC);
  cc.evex().vpmullq(ymmA, ymmB, ymmC);
  cc.evex().vpmullq(zmmA, zmmB, zmmC);
  cc.evex().vpmullw(xmmA, xmmB, xmmC);
  cc.evex().vpmullw(ymmA, ymmB, ymmC);
  cc.evex().vpmullw(zmmA, zmmB, zmmC);
  cc.evex().vpmultishiftqb(xmmA, xmmB, xmmC);
  cc.evex().vpmultishiftqb(ymmA, ymmB, ymmC);
  cc.evex().vpmultishiftqb(zmmA, zmmB, zmmC);
  cc.evex().vpmuludq(xmmA, xmmB, xmmC);
  cc.evex().vpmuludq(ymmA, ymmB, ymmC);
  cc.evex().vpmuludq(zmmA, zmmB, zmmC);
  cc.evex().vpopcntd(zmmA, zmmB);
  cc.evex().vpopcntq(zmmA, zmmB);
  cc.evex().vpord(xmmA, xmmB, xmmC);
  cc.evex().vpord(ymmA, ymmB, ymmC);
  cc.evex().vpord(zmmA, zmmB, zmmC);
  cc.evex().vporq(xmmA, xmmB, xmmC);
  cc.evex().vporq(ymmA, ymmB, ymmC);
  cc.evex().vporq(zmmA, zmmB, zmmC);
  cc.evex().vprold(xmmA, xmmB, 0);
  cc.evex().vprold(ymmA, ymmB, 0);
  cc.evex().vprold(zmmA, zmmB, 0);
  cc.evex().vprolq(xmmA, xmmB, 0);
  cc.evex().vprolq(ymmA, ymmB, 0);
  cc.evex().vprolq(zmmA, zmmB, 0);
  cc.evex().vprolvd(xmmA, xmmB, xmmC);
  cc.evex().vprolvd(ymmA, ymmB, ymmC);
  cc.evex().vprolvd(zmmA, zmmB, zmmC);
  cc.evex().vprolvq(xmmA, xmmB, xmmC);
  cc.evex().vprolvq(ymmA, ymmB, ymmC);
  cc.evex().vprolvq(zmmA, zmmB, zmmC);
  cc.evex().vprord(xmmA, xmmB, 0);
  cc.evex().vprord(ymmA, ymmB, 0);
  cc.evex().vprord(zmmA, zmmB, 0);
  cc.evex().vprorq(xmmA, xmmB, 0);
  cc.evex().vprorq(ymmA, ymmB, 0);
  cc.evex().vprorq(zmmA, zmmB, 0);
  cc.evex().vprorvd(xmmA, xmmB, xmmC);
  cc.evex().vprorvd(ymmA, ymmB, ymmC);
  cc.evex().vprorvd(zmmA, zmmB, zmmC);
  cc.evex().vprorvq(xmmA, xmmB, xmmC);
  cc.evex().vprorvq(ymmA, ymmB, ymmC);
  cc.evex().vprorvq(zmmA, zmmB, zmmC);
  cc.evex().vpsadbw(xmmA, xmmB, xmmC);
  cc.evex().vpsadbw(ymmA, ymmB, ymmC);
  cc.evex().vpsadbw(zmmA, zmmB, zmmC);
  cc.evex().vpshufb(xmmA, xmmB, xmmC);
  cc.evex().vpshufb(ymmA, ymmB, ymmC);
  cc.evex().vpshufb(zmmA, zmmB, zmmC);
  cc.evex().vpshufd(xmmA, xmmB, 0);
  cc.evex().vpshufd(ymmA, ymmB, 0);
  cc.evex().vpshufd(zmmA, zmmB, 0);
  cc.evex().vpshufhw(xmmA, xmmB, 0);
  cc.evex().vpshufhw(ymmA, ymmB, 0);
  cc.evex().vpshufhw(zmmA, zmmB, 0);
  cc.evex().vpshuflw(xmmA, xmmB, 0);
  cc.evex().vpshuflw(ymmA, ymmB, 0);
  cc.evex().vpshuflw(zmmA, zmmB, 0);
  cc.evex().vpslld(xmmA, xmmB, xmmC);
  cc.evex().vpslld(xmmA, xmmB, 0);
  cc.evex().vpslld(ymmA, ymmB, xmmC);
  cc.evex().vpslld(ymmA, ymmB, 0);
  cc.evex().vpslld(zmmA, zmmB, xmmC);
  cc.evex().vpslld(zmmA, zmmB, 0);
  cc.evex().vpslldq(xmmA, xmmB, 0);
  cc.evex().vpslldq(ymmA, ymmB, 0);
  cc.evex().vpslldq(zmmA, zmmB, 0);
  cc.evex().vpsllq(xmmA, xmmB, xmmC);
  cc.evex().vpsllq(xmmA, xmmB, 0);
  cc.evex().vpsllq(ymmA, ymmB, xmmC);
  cc.evex().vpsllq(ymmA, ymmB, 0);
  cc.evex().vpsllq(zmmA, zmmB, xmmC);
  cc.evex().vpsllq(zmmA, zmmB, 0);
  cc.evex().vpsllvd(xmmA, xmmB, xmmC);
  cc.evex().vpsllvd(ymmA, ymmB, ymmC);
  cc.evex().vpsllvd(zmmA, zmmB, zmmC);
  cc.evex().vpsllvq(xmmA, xmmB, xmmC);
  cc.evex().vpsllvq(ymmA, ymmB, ymmC);
  cc.evex().vpsllvq(zmmA, zmmB, zmmC);
  cc.evex().vpsllvw(xmmA, xmmB, xmmC);
  cc.evex().vpsllvw(ymmA, ymmB, ymmC);
  cc.evex().vpsllvw(zmmA, zmmB, zmmC);
  cc.evex().vpsllw(xmmA, xmmB, xmmC);
  cc.evex().vpsllw(xmmA, xmmB, 0);
  cc.evex().vpsllw(ymmA, ymmB, xmmC);
  cc.evex().vpsllw(ymmA, ymmB, 0);
  cc.evex().vpsllw(zmmA, zmmB, xmmC);
  cc.evex().vpsllw(zmmA, zmmB, 0);
  cc.evex().vpsrad(xmmA, xmmB, xmmC);
  cc.evex().vpsrad(xmmA, xmmB, 0);
  cc.evex().vpsrad(ymmA, ymmB, xmmC);
  cc.evex().vpsrad(ymmA, ymmB, 0);
  cc.evex().vpsrad(zmmA, zmmB, xmmC);
  cc.evex().vpsrad(zmmA, zmmB, 0);
  cc.evex().vpsraq(xmmA, xmmB, xmmC);
  cc.evex().vpsraq(xmmA, xmmB, 0);
  cc.evex().vpsraq(ymmA, ymmB, xmmC);
  cc.evex().vpsraq(ymmA, ymmB, 0);
  cc.evex().vpsraq(zmmA, zmmB, xmmC);
  cc.evex().vpsraq(zmmA, zmmB, 0);
  cc.evex().vpsravd(xmmA, xmmB, xmmC);
  cc.evex().vpsravd(ymmA, ymmB, ymmC);
  cc.evex().vpsravd(zmmA, zmmB, zmmC);
  cc.evex().vpsravq(xmmA, xmmB, xmmC);
  cc.evex().vpsravq(ymmA, ymmB, ymmC);
  cc.evex().vpsravq(zmmA, zmmB, zmmC);
  cc.evex().vpsravw(xmmA, xmmB, xmmC);
  cc.evex().vpsravw(ymmA, ymmB, ymmC);
  cc.evex().vpsravw(zmmA, zmmB, zmmC);
  cc.evex().vpsraw(xmmA, xmmB, xmmC);
  cc.evex().vpsraw(xmmA, xmmB, 0);
  cc.evex().vpsraw(ymmA, ymmB, xmmC);
  cc.evex().vpsraw(ymmA, ymmB, 0);
  cc.evex().vpsraw(zmmA, zmmB, xmmC);
  cc.evex().vpsraw(zmmA, zmmB, 0);
  cc.evex().vpsrld(xmmA, xmmB, xmmC);
  cc.evex().vpsrld(xmmA, xmmB, 0);
  cc.evex().vpsrld(ymmA, ymmB, xmmC);
  cc.evex().vpsrld(ymmA, ymmB, 0);
  cc.evex().vpsrld(zmmA, zmmB, xmmC);
  cc.evex().vpsrld(zmmA, zmmB, 0);
  cc.evex().vpsrldq(xmmA, xmmB, 0);
  cc.evex().vpsrldq(ymmA, ymmB, 0);
  cc.evex().vpsrldq(zmmA, zmmB, 0);
  cc.evex().vpsrlq(xmmA, xmmB, xmmC);
  cc.evex().vpsrlq(xmmA, xmmB, 0);
  cc.evex().vpsrlq(ymmA, ymmB, xmmC);
  cc.evex().vpsrlq(ymmA, ymmB, 0);
  cc.evex().vpsrlq(zmmA, zmmB, xmmC);
  cc.evex().vpsrlq(zmmA, zmmB, 0);
  cc.evex().vpsrlvd(xmmA, xmmB, xmmC);
  cc.evex().vpsrlvd(ymmA, ymmB, ymmC);
  cc.evex().vpsrlvd(zmmA, zmmB, zmmC);
  cc.evex().vpsrlvq(xmmA, xmmB, xmmC);
  cc.evex().vpsrlvq(ymmA, ymmB, ymmC);
  cc.evex().vpsrlvq(zmmA, zmmB, zmmC);
  cc.evex().vpsrlvw(xmmA, xmmB, xmmC);
  cc.evex().vpsrlvw(ymmA, ymmB, ymmC);
  cc.evex().vpsrlvw(zmmA, zmmB, zmmC);
  cc.evex().vpsrlw(xmmA, xmmB, xmmC);
  cc.evex().vpsrlw(xmmA, xmmB, 0);
  cc.evex().vpsrlw(ymmA, ymmB, xmmC);
  cc.evex().vpsrlw(ymmA, ymmB, 0);
  cc.evex().vpsrlw(zmmA, zmmB, xmmC);
  cc.evex().vpsrlw(zmmA, zmmB, 0);
  cc.evex().vpsubb(xmmA, xmmB, xmmC);
  cc.evex().vpsubb(ymmA, ymmB, ymmC);
  cc.evex().vpsubb(zmmA, zmmB, zmmC);
  cc.evex().vpsubd(xmmA, xmmB, xmmC);
  cc.evex().vpsubd(ymmA, ymmB, ymmC);
  cc.evex().vpsubd(zmmA, zmmB, zmmC);
  cc.evex().vpsubq(xmmA, xmmB, xmmC);
  cc.evex().vpsubq(ymmA, ymmB, ymmC);
  cc.evex().vpsubq(zmmA, zmmB, zmmC);
  cc.evex().vpsubsb(xmmA, xmmB, xmmC);
  cc.evex().vpsubsb(ymmA, ymmB, ymmC);
  cc.evex().vpsubsb(zmmA, zmmB, zmmC);
  cc.evex().vpsubsw(xmmA, xmmB, xmmC);
  cc.evex().vpsubsw(ymmA, ymmB, ymmC);
  cc.evex().vpsubsw(zmmA, zmmB, zmmC);
  cc.evex().vpsubusb(xmmA, xmmB, xmmC);
  cc.evex().vpsubusb(ymmA, ymmB, ymmC);
  cc.evex().vpsubusb(zmmA, zmmB, zmmC);
  cc.evex().vpsubusw(xmmA, xmmB, xmmC);
  cc.evex().vpsubusw(ymmA, ymmB, ymmC);
  cc.evex().vpsubusw(zmmA, zmmB, zmmC);
  cc.evex().vpsubw(xmmA, xmmB, xmmC);
  cc.evex().vpsubw(ymmA, ymmB, ymmC);
  cc.evex().vpsubw(zmmA, zmmB, zmmC);
  cc.evex().vpternlogd(xmmA, xmmB, xmmC, 0);
  cc.evex().vpternlogd(ymmA, ymmB, ymmC, 0);
  cc.evex().vpternlogd(zmmA, zmmB, zmmC, 0);
  cc.evex().vpternlogq(xmmA, xmmB, xmmC, 0);
  cc.evex().vpternlogq(ymmA, ymmB, ymmC, 0);
  cc.evex().vpternlogq(zmmA, zmmB, zmmC, 0);
  cc.evex().vptestmb(kA, xmmB, xmmC);
  cc.evex().vptestmb(kA, ymmB, ymmC);
  cc.evex().vptestmb(kA, zmmB, zmmC);
  cc.evex().vptestmd(kA, xmmB, xmmC);
  cc.evex().vptestmd(kA, ymmB, ymmC);
  cc.evex().vptestmd(kA, zmmB, zmmC);
  cc.evex().vptestmq(kA, xmmB, xmmC);
  cc.evex().vptestmq(kA, ymmB, ymmC);
  cc.evex().vptestmq(kA, zmmB, zmmC);
  cc.evex().vptestmw(kA, xmmB, xmmC);
  cc.evex().vptestmw(kA, ymmB, ymmC);
  cc.evex().vptestmw(kA, zmmB, zmmC);
  cc.evex().vptestnmb(kA, xmmB, xmmC);
  cc.evex().vptestnmb(kA, ymmB, ymmC);
  cc.evex().vptestnmb(kA, zmmB, zmmC);
  cc.evex().vptestnmd(kA, xmmB, xmmC);
  cc.evex().vptestnmd(kA, ymmB, ymmC);
  cc.evex().vptestnmd(kA, zmmB, zmmC);
  cc.evex().vptestnmq(kA, xmmB, xmmC);
  cc.evex().vptestnmq(kA, ymmB, ymmC);
  cc.evex().vptestnmq(kA, zmmB, zmmC);
  cc.evex().vptestnmw(kA, xmmB, xmmC);
  cc.evex().vptestnmw(kA, ymmB, ymmC);
  cc.evex().vptestnmw(kA, zmmB, zmmC);
  cc.evex().vpunpckhbw(xmmA, xmmB, xmmC);
  cc.evex().vpunpckhbw(ymmA, ymmB, ymmC);
  cc.evex().vpunpckhbw(zmmA, zmmB, zmmC);
  cc.evex().vpunpckhdq(xmmA, xmmB, xmmC);
  cc.evex().vpunpckhdq(ymmA, ymmB, ymmC);
  cc.evex().vpunpckhdq(zmmA, zmmB, zmmC);
  cc.evex().vpunpckhqdq(xmmA, xmmB, xmmC);
  cc.evex().vpunpckhqdq(ymmA, ymmB, ymmC);
  cc.evex().vpunpckhqdq(zmmA, zmmB, zmmC);
  cc.evex().vpunpckhwd(xmmA, xmmB, xmmC);
  cc.evex().vpunpckhwd(ymmA, ymmB, ymmC);
  cc.evex().vpunpckhwd(zmmA, zmmB, zmmC);
  cc.evex().vpunpcklbw(xmmA, xmmB, xmmC);
  cc.evex().vpunpcklbw(ymmA, ymmB, ymmC);
  cc.evex().vpunpcklbw(zmmA, zmmB, zmmC);
  cc.evex().vpunpckldq(xmmA, xmmB, xmmC);
  cc.evex().vpunpckldq(ymmA, ymmB, ymmC);
  cc.evex().vpunpckldq(zmmA, zmmB, zmmC);
  cc.evex().vpunpcklqdq(xmmA, xmmB, xmmC);
  cc.evex().vpunpcklqdq(ymmA, ymmB, ymmC);
  cc.evex().vpunpcklqdq(zmmA, zmmB, zmmC);
  cc.evex().vpunpcklwd(xmmA, xmmB, xmmC);
  cc.evex().vpunpcklwd(ymmA, ymmB, ymmC);
  cc.evex().vpunpcklwd(zmmA, zmmB, zmmC);
  cc.evex().vpxord(xmmA, xmmB, xmmC);
  cc.evex().vpxord(ymmA, ymmB, ymmC);
  cc.evex().vpxord(zmmA, zmmB, zmmC);
  cc.evex().vpxorq(xmmA, xmmB, xmmC);
  cc.evex().vpxorq(ymmA, ymmB, ymmC);
  cc.evex().vpxorq(zmmA, zmmB, zmmC);
  cc.evex().vrangepd(xmmA, xmmB, xmmC, 0);
  cc.evex().vrangepd(ymmA, ymmB, ymmC, 0);
  cc.evex().vrangepd(zmmA, zmmB, zmmC, 0);
  cc.evex().vrangeps(xmmA, xmmB, xmmC, 0);
  cc.evex().vrangeps(ymmA, ymmB, ymmC, 0);
  cc.evex().vrangeps(zmmA, zmmB, zmmC, 0);
  cc.evex().vrangesd(xmmA, xmmB, xmmC, 0);
  cc.evex().vrangess(xmmA, xmmB, xmmC, 0);
  cc.evex().vrcp14pd(xmmA, xmmB);
  cc.evex().vrcp14pd(ymmA, ymmB);
  cc.evex().vrcp14pd(zmmA, zmmB);
  cc.evex().vrcp14ps(xmmA, xmmB);
  cc.evex().vrcp14ps(ymmA, ymmB);
  cc.evex().vrcp14ps(zmmA, zmmB);
  cc.evex().vrcp14sd(xmmA, xmmB, xmmC);
  cc.evex().vrcp14ss(xmmA, xmmB, xmmC);
  cc.evex().vrcp28pd(zmmA, zmmB);
  cc.evex().vrcp28ps(zmmA, zmmB);
  cc.evex().vrcp28sd(xmmA, xmmB, xmmC);
  cc.evex().vrcp28ss(xmmA, xmmB, xmmC);
  cc.evex().vreducepd(xmmA, xmmB, 0);
  cc.evex().vreducepd(ymmA, ymmB, 0);
  cc.evex().vreducepd(zmmA, zmmB, 0);
  cc.evex().vreduceps(xmmA, xmmB, 0);
  cc.evex().vreduceps(ymmA, ymmB, 0);
  cc.evex().vreduceps(zmmA, zmmB, 0);
  cc.evex().vreducesd(xmmA, xmmB, xmmC, 0);
  cc.evex().vreducess(xmmA, xmmB, xmmC, 0);
  cc.evex().vrndscalepd(xmmA, xmmB, 0);
  cc.evex().vrndscalepd(ymmA, ymmB, 0);
  cc.evex().vrndscalepd(zmmA, zmmB, 0);
  cc.evex().vrndscaleps(xmmA, xmmB, 0);
  cc.evex().vrndscaleps(ymmA, ymmB, 0);
  cc.evex().vrndscaleps(zmmA, zmmB, 0);
  cc.evex().vrndscalesd(xmmA, xmmB, xmmC, 0);
  cc.evex().vrndscaless(xmmA, xmmB, xmmC, 0);
  cc.evex().vrsqrt14pd(xmmA, xmmB);
  cc.evex().vrsqrt14pd(ymmA, ymmB);
  cc.evex().vrsqrt14pd(zmmA, zmmB);
  cc.evex().vrsqrt14ps(xmmA, xmmB);
  cc.evex().vrsqrt14ps(ymmA, ymmB);
  cc.evex().vrsqrt14ps(zmmA, zmmB);
  cc.evex().vrsqrt14sd(xmmA, xmmB, xmmC);
  cc.evex().vrsqrt14ss(xmmA, xmmB, xmmC);
  cc.evex().vrsqrt28pd(zmmA, zmmB);
  cc.evex().vrsqrt28ps(zmmA, zmmB);
  cc.evex().vrsqrt28sd(xmmA, xmmB, xmmC);
  cc.evex().vrsqrt28ss(xmmA, xmmB, xmmC);
  cc.evex().vscalefpd(xmmA, xmmB, xmmC);
  cc.evex().vscalefpd(ymmA, ymmB, ymmC);
  cc.evex().vscalefpd(zmmA, zmmB, zmmC);
  cc.evex().vscalefps(xmmA, xmmB, xmmC);
  cc.evex().vscalefps(ymmA, ymmB, ymmC);
  cc.evex().vscalefps(zmmA, zmmB, zmmC);
  cc.evex().vscalefsd(xmmA, xmmB, xmmC);
  cc.evex().vscalefss(xmmA, xmmB, xmmC);
  cc.evex().vshuff32x4(ymmA, ymmB, ymmC, 0);
  cc.evex().vshuff32x4(zmmA, zmmB, zmmC, 0);
  cc.evex().vshuff64x2(ymmA, ymmB, ymmC, 0);
  cc.evex().vshuff64x2(zmmA, zmmB, zmmC, 0);
  cc.evex().vshufi32x4(ymmA, ymmB, ymmC, 0);
  cc.evex().vshufi32x4(zmmA, zmmB, zmmC, 0);
  cc.evex().vshufi64x2(ymmA, ymmB, ymmC, 0);
  cc.evex().vshufi64x2(zmmA, zmmB, zmmC, 0);
  cc.evex().vshufpd(xmmA, xmmB, xmmC, 0);
  cc.evex().vshufpd(ymmA, ymmB, ymmC, 0);
  cc.evex().vshufpd(zmmA, zmmB, zmmC, 0);
  cc.evex().vshufps(xmmA, xmmB, xmmC, 0);
  cc.evex().vshufps(ymmA, ymmB, ymmC, 0);
  cc.evex().vshufps(zmmA, zmmB, zmmC, 0);
  cc.evex().vsqrtpd(xmmA, xmmB);
  cc.evex().vsqrtpd(ymmA, ymmB);
  cc.evex().vsqrtpd(zmmA, zmmB);
  cc.evex().vsqrtps(xmmA, xmmB);
  cc.evex().vsqrtps(ymmA, ymmB);
  cc.evex().vsqrtps(zmmA, zmmB);
  cc.evex().vsqrtsd(xmmA, xmmB, xmmC);
  cc.evex().vsqrtss(xmmA, xmmB, xmmC);
  cc.evex().vsubpd(xmmA, xmmB, xmmC);
  cc.evex().vsubpd(ymmA, ymmB, ymmC);
  cc.evex().vsubpd(zmmA, zmmB, zmmC);
  cc.evex().vsubps(xmmA, xmmB, xmmC);
  cc.evex().vsubps(ymmA, ymmB, ymmC);
  cc.evex().vsubps(zmmA, zmmB, zmmC);
  cc.evex().vsubsd(xmmA, xmmB, xmmC);
  cc.evex().vsubss(xmmA, xmmB, xmmC);
  cc.evex().vucomisd(xmmA, xmmB);
  cc.evex().vucomiss(xmmA, xmmB);
  cc.evex().vunpckhpd(xmmA, xmmB, xmmC);
  cc.evex().vunpckhpd(ymmA, ymmB, ymmC);
  cc.evex().vunpckhpd(zmmA, zmmB, zmmC);
  cc.evex().vunpckhps(xmmA, xmmB, xmmC);
  cc.evex().vunpckhps(ymmA, ymmB, ymmC);
  cc.evex().vunpckhps(zmmA, zmmB, zmmC);
  cc.evex().vunpcklpd(xmmA, xmmB, xmmC);
  cc.evex().vunpcklpd(ymmA, ymmB, ymmC);
  cc.evex().vunpcklpd(zmmA, zmmB, zmmC);
  cc.evex().vunpcklps(xmmA, xmmB, xmmC);
  cc.evex().vunpcklps(ymmA, ymmB, ymmC);
  cc.evex().vunpcklps(zmmA, zmmB, zmmC);
  cc.evex().vxorpd(xmmA, xmmB, xmmC);
  cc.evex().vxorpd(ymmA, ymmB, ymmC);
  cc.evex().vxorpd(zmmA, zmmB, zmmC);
  cc.evex().vxorps(xmmA, xmmB, xmmC);
  cc.evex().vxorps(ymmA, ymmB, ymmC);
  cc.evex().vxorps(zmmA, zmmB, zmmC);
}

template<typename Emitter>
static void generateAvx512SequenceInternalRegMem(
  Emitter& cc,
  const x86::Gp& gp,
  const x86::KReg& kA, const x86::KReg& kB, const x86::KReg& kC,
  const x86::Vec& vecA, const x86::Vec& vecB, const x86::Vec& vecC, const x86::Vec& vecD) {

  DebugUtils::unused(kC);

  x86::Gp gpd = gp.r32();
  x86::Gp gpq = gp.r64();
  x86::Gp gpz = cc.is32Bit() ? gpd : gpq;

  x86::Xmm xmmA = vecA.xmm();
  x86::Xmm xmmB = vecB.xmm();
  x86::Xmm xmmC = vecC.xmm();
  x86::Xmm xmmD = vecD.xmm();

  x86::Ymm ymmA = vecA.ymm();
  x86::Ymm ymmB = vecB.ymm();
  x86::Ymm ymmD = vecD.ymm();

  x86::Zmm zmmA = vecA.zmm();
  x86::Zmm zmmB = vecB.zmm();
  x86::Zmm zmmD = vecD.zmm();

  x86::Mem m = x86::ptr(gpz);
  x86::Mem m32 = x86::dword_ptr(gpz);
  x86::Mem m64 = x86::qword_ptr(gpz);
  x86::Mem m128 = x86::xmmword_ptr(gpz);
  x86::Mem m256 = x86::ymmword_ptr(gpz);
  x86::Mem m512 = x86::zmmword_ptr(gpz);
  x86::Mem vx_ptr = x86::ptr(gpz, xmmD);
  x86::Mem vy_ptr = x86::ptr(gpz, ymmD);
  x86::Mem vz_ptr = x86::ptr(gpz, zmmD);

  cc.xor_(gpd, gpd);
  cc.vxorps(xmmA, xmmA, xmmA);
  cc.vxorps(xmmB, xmmB, xmmB);
  cc.vxorps(xmmC, xmmC, xmmC);
  cc.vxorps(xmmD, xmmD, xmmD);

  cc.kmovb(kA, m);
  cc.kmovb(m, kB);
  cc.kmovd(kA, m);
  cc.kmovd(m, kB);
  cc.kmovq(kA, m);
  cc.kmovq(m, kB);
  cc.kmovw(kA, m);
  cc.kmovw(m, kB);

  cc.evex().vaddpd(xmmA, xmmB, m);
  cc.evex().vaddpd(ymmA, ymmB, m);
  cc.evex().vaddpd(zmmA, zmmB, m);
  cc.evex().vaddps(xmmA, xmmB, m);
  cc.evex().vaddps(ymmA, ymmB, m);
  cc.evex().vaddps(zmmA, zmmB, m);
  cc.evex().vaddsd(xmmA, xmmB, m);
  cc.evex().vaddss(xmmA, xmmB, m);
  cc.evex().valignd(xmmA, xmmB, m, 0);
  cc.evex().valignd(ymmA, ymmB, m, 0);
  cc.evex().valignd(zmmA, zmmB, m, 0);
  cc.evex().valignq(xmmA, xmmB, m, 0);
  cc.evex().valignq(ymmA, ymmB, m, 0);
  cc.evex().valignq(zmmA, zmmB, m, 0);
  cc.evex().vandnpd(xmmA, xmmB, m);
  cc.evex().vandnpd(ymmA, ymmB, m);
  cc.evex().vandnpd(zmmA, zmmB, m);
  cc.evex().vandnps(xmmA, xmmB, m);
  cc.evex().vandnps(ymmA, ymmB, m);
  cc.evex().vandnps(zmmA, zmmB, m);
  cc.evex().vandpd(xmmA, xmmB, m);
  cc.evex().vandpd(ymmA, ymmB, m);
  cc.evex().vandpd(zmmA, zmmB, m);
  cc.evex().vandps(xmmA, xmmB, m);
  cc.evex().vandps(ymmA, ymmB, m);
  cc.evex().vandps(zmmA, zmmB, m);
  cc.evex().vblendmpd(xmmA, xmmB, m);
  cc.evex().vblendmpd(ymmA, ymmB, m);
  cc.evex().vblendmpd(zmmA, zmmB, m);
  cc.evex().vblendmps(xmmA, xmmB, m);
  cc.evex().vblendmps(ymmA, ymmB, m);
  cc.evex().vblendmps(zmmA, zmmB, m);
  cc.evex().vbroadcastf32x2(ymmA, m);
  cc.evex().vbroadcastf32x2(zmmA, m);
  cc.evex().vbroadcastf32x4(ymmA, m);
  cc.evex().vbroadcastf32x4(zmmA, m);
  cc.evex().vbroadcastf32x8(zmmA, m);
  cc.evex().vbroadcastf64x2(ymmA, m);
  cc.evex().vbroadcastf64x2(zmmA, m);
  cc.evex().vbroadcastf64x4(zmmA, m);
  cc.evex().vbroadcasti32x2(xmmA, m);
  cc.evex().vbroadcasti32x2(ymmA, m);
  cc.evex().vbroadcasti32x2(zmmA, m);
  cc.evex().vbroadcasti32x4(ymmA, m);
  cc.evex().vbroadcasti32x4(zmmA, m);
  cc.evex().vbroadcasti32x8(zmmA, m);
  cc.evex().vbroadcasti64x2(ymmA, m);
  cc.evex().vbroadcasti64x2(zmmA, m);
  cc.evex().vbroadcasti64x4(zmmA, m);
  cc.evex().vbroadcastsd(ymmA, m);
  cc.evex().vbroadcastsd(zmmA, m);
  cc.evex().vbroadcastss(xmmA, m);
  cc.evex().vbroadcastss(ymmA, m);
  cc.evex().vbroadcastss(zmmA, m);
  cc.evex().vcmppd(kA, xmmB, m, 0);
  cc.evex().vcmppd(kA, ymmB, m, 0);
  cc.evex().vcmppd(kA, zmmB, m, 0);
  cc.evex().vcmpps(kA, xmmB, m, 0);
  cc.evex().vcmpps(kA, ymmB, m, 0);
  cc.evex().vcmpps(kA, zmmB, m, 0);
  cc.evex().vcmpsd(kA, xmmB, m, 0);
  cc.evex().vcmpss(kA, xmmB, m, 0);
  cc.evex().vcomisd(xmmA, m);
  cc.evex().vcomiss(xmmA, m);
  cc.evex().vcompresspd(m, xmmB);
  cc.evex().vcompresspd(m, ymmB);
  cc.evex().vcompresspd(m, zmmB);
  cc.evex().vcompressps(m, xmmB);
  cc.evex().vcompressps(m, ymmB);
  cc.evex().vcompressps(m, zmmB);
  cc.evex().vcvtdq2pd(xmmA, m);
  cc.evex().vcvtdq2pd(ymmA, m);
  cc.evex().vcvtdq2pd(zmmA, m);
  cc.evex().vcvtdq2ps(xmmA, m);
  cc.evex().vcvtdq2ps(ymmA, m);
  cc.evex().vcvtdq2ps(zmmA, m);
  cc.evex().vcvtpd2dq(xmmA, m128);
  cc.evex().vcvtpd2dq(xmmA, m256);
  cc.evex().vcvtpd2dq(ymmA, m512);
  cc.evex().vcvtpd2qq(xmmA, m);
  cc.evex().vcvtpd2qq(ymmA, m);
  cc.evex().vcvtpd2qq(zmmA, m);
  cc.evex().vcvtpd2udq(xmmA, m128);
  cc.evex().vcvtpd2udq(xmmA, m256);
  cc.evex().vcvtpd2udq(ymmA, m512);
  cc.evex().vcvtpd2uqq(xmmA, m);
  cc.evex().vcvtpd2uqq(ymmA, m);
  cc.evex().vcvtpd2uqq(zmmA, m);
  cc.evex().vcvtph2ps(xmmA, m);
  cc.evex().vcvtph2ps(ymmA, m);
  cc.evex().vcvtph2ps(zmmA, m);
  cc.evex().vcvtps2dq(xmmA, m);
  cc.evex().vcvtps2dq(ymmA, m);
  cc.evex().vcvtps2dq(zmmA, m);
  cc.evex().vcvtps2pd(xmmA, m);
  cc.evex().vcvtps2pd(ymmA, m);
  cc.evex().vcvtps2pd(zmmA, m);
  cc.evex().vcvtps2ph(m, xmmB, 0);
  cc.evex().vcvtps2ph(m, ymmB, 0);
  cc.evex().vcvtps2ph(m, zmmB, 0);
  cc.evex().vcvtps2qq(xmmA, m);
  cc.evex().vcvtps2qq(ymmA, m);
  cc.evex().vcvtps2qq(zmmA, m);
  cc.evex().vcvtps2udq(xmmA, m);
  cc.evex().vcvtps2udq(ymmA, m);
  cc.evex().vcvtps2udq(zmmA, m);
  cc.evex().vcvtps2uqq(xmmA, m);
  cc.evex().vcvtps2uqq(ymmA, m);
  cc.evex().vcvtps2uqq(zmmA, m);
  cc.evex().vcvtqq2pd(xmmA, m);
  cc.evex().vcvtqq2pd(ymmA, m);
  cc.evex().vcvtqq2pd(zmmA, m);
  cc.evex().vcvtqq2ps(xmmA, m128);
  cc.evex().vcvtqq2ps(xmmA, m256);
  cc.evex().vcvtqq2ps(ymmA, m512);
  cc.evex().vcvtsd2si(gpd, m);
  cc.evex().vcvtsd2si(gpz, m);
  cc.evex().vcvtsd2ss(xmmA, xmmB, m);
  cc.evex().vcvtsd2usi(gpd, m);
  cc.evex().vcvtsd2usi(gpz, m);
  cc.evex().vcvtsi2sd(xmmA, xmmB, m32);
  if (cc.is64Bit()) cc.evex().vcvtsi2sd(xmmA, xmmB, m64);
  cc.evex().vcvtsi2ss(xmmA, xmmB, m32);
  if (cc.is64Bit()) cc.evex().vcvtsi2ss(xmmA, xmmB, m64);
  cc.evex().vcvtss2sd(xmmA, xmmB, m);
  cc.evex().vcvtss2si(gpd, m);
  cc.evex().vcvtss2si(gpz, m);
  cc.evex().vcvtss2usi(gpd, m);
  cc.evex().vcvtss2usi(gpz, m);
  cc.evex().vcvttpd2dq(xmmA, m128);
  cc.evex().vcvttpd2dq(xmmA, m256);
  cc.evex().vcvttpd2dq(ymmA, m512);
  cc.evex().vcvttpd2qq(xmmA, m);
  cc.evex().vcvttpd2qq(ymmA, m);
  cc.evex().vcvttpd2qq(zmmA, m);
  cc.evex().vcvttpd2udq(xmmA, m128);
  cc.evex().vcvttpd2udq(xmmA, m256);
  cc.evex().vcvttpd2udq(ymmA, m512);
  cc.evex().vcvttpd2uqq(xmmA, m);
  cc.evex().vcvttpd2uqq(ymmA, m);
  cc.evex().vcvttpd2uqq(zmmA, m);
  cc.evex().vcvttps2dq(xmmA, m);
  cc.evex().vcvttps2dq(ymmA, m);
  cc.evex().vcvttps2dq(zmmA, m);
  cc.evex().vcvttps2qq(xmmA, m);
  cc.evex().vcvttps2qq(ymmA, m);
  cc.evex().vcvttps2qq(zmmA, m);
  cc.evex().vcvttps2udq(xmmA, m);
  cc.evex().vcvttps2udq(ymmA, m);
  cc.evex().vcvttps2udq(zmmA, m);
  cc.evex().vcvttps2uqq(xmmA, m);
  cc.evex().vcvttps2uqq(ymmA, m);
  cc.evex().vcvttps2uqq(zmmA, m);
  cc.evex().vcvttsd2si(gpd, m);
  cc.evex().vcvttsd2si(gpz, m);
  cc.evex().vcvttsd2usi(gpd, m);
  cc.evex().vcvttsd2usi(gpz, m);
  cc.evex().vcvttss2si(gpd, m);
  cc.evex().vcvttss2si(gpz, m);
  cc.evex().vcvttss2usi(gpd, m);
  cc.evex().vcvttss2usi(gpz, m);
  cc.evex().vcvtudq2pd(xmmA, m);
  cc.evex().vcvtudq2pd(ymmA, m);
  cc.evex().vcvtudq2pd(zmmA, m);
  cc.evex().vcvtudq2ps(xmmA, m);
  cc.evex().vcvtudq2ps(ymmA, m);
  cc.evex().vcvtudq2ps(zmmA, m);
  cc.evex().vcvtuqq2pd(xmmA, m);
  cc.evex().vcvtuqq2pd(ymmA, m);
  cc.evex().vcvtuqq2pd(zmmA, m);
  cc.evex().vcvtuqq2ps(xmmA, m128);
  cc.evex().vcvtuqq2ps(xmmA, m256);
  cc.evex().vcvtuqq2ps(ymmA, m512);
  cc.evex().vcvtusi2sd(xmmA, xmmB, m32);
  if (cc.is64Bit()) cc.evex().vcvtusi2sd(xmmA, xmmB, m64);
  cc.evex().vcvtusi2ss(xmmA, xmmB, m32);
  if (cc.is64Bit()) cc.evex().vcvtusi2ss(xmmA, xmmB, m64);
  cc.evex().vdbpsadbw(xmmA, xmmB, m, 0);
  cc.evex().vdbpsadbw(ymmA, ymmB, m, 0);
  cc.evex().vdbpsadbw(zmmA, zmmB, m, 0);
  cc.evex().vdivpd(xmmA, xmmB, m);
  cc.evex().vdivpd(ymmA, ymmB, m);
  cc.evex().vdivpd(zmmA, zmmB, m);
  cc.evex().vdivps(xmmA, xmmB, m);
  cc.evex().vdivps(ymmA, ymmB, m);
  cc.evex().vdivps(zmmA, zmmB, m);
  cc.evex().vdivsd(xmmA, xmmB, m);
  cc.evex().vdivss(xmmA, xmmB, m);
  cc.evex().vexp2pd(zmmA, m);
  cc.evex().vexp2ps(zmmA, m);
  cc.evex().vexpandpd(xmmA, m);
  cc.evex().vexpandpd(ymmA, m);
  cc.evex().vexpandpd(zmmA, m);
  cc.evex().vexpandps(xmmA, m);
  cc.evex().vexpandps(ymmA, m);
  cc.evex().vexpandps(zmmA, m);
  cc.evex().vextractf32x4(m, ymmB, 0);
  cc.evex().vextractf32x4(m, zmmB, 0);
  cc.evex().vextractf32x8(m, zmmB, 0);
  cc.evex().vextractf64x2(m, ymmB, 0);
  cc.evex().vextractf64x2(m, zmmB, 0);
  cc.evex().vextractf64x4(m, zmmB, 0);
  cc.evex().vextracti32x4(m, ymmB, 0);
  cc.evex().vextracti32x4(m, zmmB, 0);
  cc.evex().vextracti32x8(m, zmmB, 0);
  cc.evex().vextracti64x2(m, ymmB, 0);
  cc.evex().vextracti64x2(m, zmmB, 0);
  cc.evex().vextracti64x4(m, zmmB, 0);
  cc.evex().vextractps(m, xmmB, 0);
  cc.evex().vfixupimmpd(xmmA, xmmB, m, 0);
  cc.evex().vfixupimmpd(ymmA, ymmB, m, 0);
  cc.evex().vfixupimmpd(zmmA, zmmB, m, 0);
  cc.evex().vfixupimmps(xmmA, xmmB, m, 0);
  cc.evex().vfixupimmps(ymmA, ymmB, m, 0);
  cc.evex().vfixupimmps(zmmA, zmmB, m, 0);
  cc.evex().vfixupimmsd(xmmA, xmmB, m, 0);
  cc.evex().vfixupimmss(xmmA, xmmB, m, 0);
  cc.evex().vfmadd132pd(xmmA, xmmB, m);
  cc.evex().vfmadd132pd(ymmA, ymmB, m);
  cc.evex().vfmadd132pd(zmmA, zmmB, m);
  cc.evex().vfmadd132ps(xmmA, xmmB, m);
  cc.evex().vfmadd132ps(ymmA, ymmB, m);
  cc.evex().vfmadd132ps(zmmA, zmmB, m);
  cc.evex().vfmadd132sd(xmmA, xmmB, m);
  cc.evex().vfmadd132ss(xmmA, xmmB, m);
  cc.evex().vfmadd213pd(xmmA, xmmB, m);
  cc.evex().vfmadd213pd(ymmA, ymmB, m);
  cc.evex().vfmadd213pd(zmmA, zmmB, m);
  cc.evex().vfmadd213ps(xmmA, xmmB, m);
  cc.evex().vfmadd213ps(ymmA, ymmB, m);
  cc.evex().vfmadd213ps(zmmA, zmmB, m);
  cc.evex().vfmadd213sd(xmmA, xmmB, m);
  cc.evex().vfmadd213ss(xmmA, xmmB, m);
  cc.evex().vfmadd231pd(xmmA, xmmB, m);
  cc.evex().vfmadd231pd(ymmA, ymmB, m);
  cc.evex().vfmadd231pd(zmmA, zmmB, m);
  cc.evex().vfmadd231ps(xmmA, xmmB, m);
  cc.evex().vfmadd231ps(ymmA, ymmB, m);
  cc.evex().vfmadd231ps(zmmA, zmmB, m);
  cc.evex().vfmadd231sd(xmmA, xmmB, m);
  cc.evex().vfmadd231ss(xmmA, xmmB, m);
  cc.evex().vfmaddsub132pd(xmmA, xmmB, m);
  cc.evex().vfmaddsub132pd(ymmA, ymmB, m);
  cc.evex().vfmaddsub132pd(zmmA, zmmB, m);
  cc.evex().vfmaddsub132ps(xmmA, xmmB, m);
  cc.evex().vfmaddsub132ps(ymmA, ymmB, m);
  cc.evex().vfmaddsub132ps(zmmA, zmmB, m);
  cc.evex().vfmaddsub213pd(xmmA, xmmB, m);
  cc.evex().vfmaddsub213pd(ymmA, ymmB, m);
  cc.evex().vfmaddsub213pd(zmmA, zmmB, m);
  cc.evex().vfmaddsub213ps(xmmA, xmmB, m);
  cc.evex().vfmaddsub213ps(ymmA, ymmB, m);
  cc.evex().vfmaddsub213ps(zmmA, zmmB, m);
  cc.evex().vfmaddsub231pd(xmmA, xmmB, m);
  cc.evex().vfmaddsub231pd(ymmA, ymmB, m);
  cc.evex().vfmaddsub231pd(zmmA, zmmB, m);
  cc.evex().vfmaddsub231ps(xmmA, xmmB, m);
  cc.evex().vfmaddsub231ps(ymmA, ymmB, m);
  cc.evex().vfmaddsub231ps(zmmA, zmmB, m);
  cc.evex().vfmsub132pd(xmmA, xmmB, m);
  cc.evex().vfmsub132pd(ymmA, ymmB, m);
  cc.evex().vfmsub132pd(zmmA, zmmB, m);
  cc.evex().vfmsub132ps(xmmA, xmmB, m);
  cc.evex().vfmsub132ps(ymmA, ymmB, m);
  cc.evex().vfmsub132ps(zmmA, zmmB, m);
  cc.evex().vfmsub132sd(xmmA, xmmB, m);
  cc.evex().vfmsub132ss(xmmA, xmmB, m);
  cc.evex().vfmsub213pd(xmmA, xmmB, m);
  cc.evex().vfmsub213pd(ymmA, ymmB, m);
  cc.evex().vfmsub213pd(zmmA, zmmB, m);
  cc.evex().vfmsub213ps(xmmA, xmmB, m);
  cc.evex().vfmsub213ps(ymmA, ymmB, m);
  cc.evex().vfmsub213ps(zmmA, zmmB, m);
  cc.evex().vfmsub213sd(xmmA, xmmB, m);
  cc.evex().vfmsub213ss(xmmA, xmmB, m);
  cc.evex().vfmsub231pd(xmmA, xmmB, m);
  cc.evex().vfmsub231pd(ymmA, ymmB, m);
  cc.evex().vfmsub231pd(zmmA, zmmB, m);
  cc.evex().vfmsub231ps(xmmA, xmmB, m);
  cc.evex().vfmsub231ps(ymmA, ymmB, m);
  cc.evex().vfmsub231ps(zmmA, zmmB, m);
  cc.evex().vfmsub231sd(xmmA, xmmB, m);
  cc.evex().vfmsub231ss(xmmA, xmmB, m);
  cc.evex().vfmsubadd132pd(xmmA, xmmB, m);
  cc.evex().vfmsubadd132pd(ymmA, ymmB, m);
  cc.evex().vfmsubadd132pd(zmmA, zmmB, m);
  cc.evex().vfmsubadd132ps(xmmA, xmmB, m);
  cc.evex().vfmsubadd132ps(ymmA, ymmB, m);
  cc.evex().vfmsubadd132ps(zmmA, zmmB, m);
  cc.evex().vfmsubadd213pd(xmmA, xmmB, m);
  cc.evex().vfmsubadd213pd(ymmA, ymmB, m);
  cc.evex().vfmsubadd213pd(zmmA, zmmB, m);
  cc.evex().vfmsubadd213ps(xmmA, xmmB, m);
  cc.evex().vfmsubadd213ps(ymmA, ymmB, m);
  cc.evex().vfmsubadd213ps(zmmA, zmmB, m);
  cc.evex().vfmsubadd231pd(xmmA, xmmB, m);
  cc.evex().vfmsubadd231pd(ymmA, ymmB, m);
  cc.evex().vfmsubadd231pd(zmmA, zmmB, m);
  cc.evex().vfmsubadd231ps(xmmA, xmmB, m);
  cc.evex().vfmsubadd231ps(ymmA, ymmB, m);
  cc.evex().vfmsubadd231ps(zmmA, zmmB, m);
  cc.evex().vfnmadd132pd(xmmA, xmmB, m);
  cc.evex().vfnmadd132pd(ymmA, ymmB, m);
  cc.evex().vfnmadd132pd(zmmA, zmmB, m);
  cc.evex().vfnmadd132ps(xmmA, xmmB, m);
  cc.evex().vfnmadd132ps(ymmA, ymmB, m);
  cc.evex().vfnmadd132ps(zmmA, zmmB, m);
  cc.evex().vfnmadd132sd(xmmA, xmmB, m);
  cc.evex().vfnmadd132ss(xmmA, xmmB, m);
  cc.evex().vfnmadd213pd(xmmA, xmmB, m);
  cc.evex().vfnmadd213pd(ymmA, ymmB, m);
  cc.evex().vfnmadd213pd(zmmA, zmmB, m);
  cc.evex().vfnmadd213ps(xmmA, xmmB, m);
  cc.evex().vfnmadd213ps(ymmA, ymmB, m);
  cc.evex().vfnmadd213ps(zmmA, zmmB, m);
  cc.evex().vfnmadd213sd(xmmA, xmmB, m);
  cc.evex().vfnmadd213ss(xmmA, xmmB, m);
  cc.evex().vfnmadd231pd(xmmA, xmmB, m);
  cc.evex().vfnmadd231pd(ymmA, ymmB, m);
  cc.evex().vfnmadd231pd(zmmA, zmmB, m);
  cc.evex().vfnmadd231ps(xmmA, xmmB, m);
  cc.evex().vfnmadd231ps(ymmA, ymmB, m);
  cc.evex().vfnmadd231ps(zmmA, zmmB, m);
  cc.evex().vfnmadd231sd(xmmA, xmmB, m);
  cc.evex().vfnmadd231ss(xmmA, xmmB, m);
  cc.evex().vfnmsub132pd(xmmA, xmmB, m);
  cc.evex().vfnmsub132pd(ymmA, ymmB, m);
  cc.evex().vfnmsub132pd(zmmA, zmmB, m);
  cc.evex().vfnmsub132ps(xmmA, xmmB, m);
  cc.evex().vfnmsub132ps(ymmA, ymmB, m);
  cc.evex().vfnmsub132ps(zmmA, zmmB, m);
  cc.evex().vfnmsub132sd(xmmA, xmmB, m);
  cc.evex().vfnmsub132ss(xmmA, xmmB, m);
  cc.evex().vfnmsub213pd(xmmA, xmmB, m);
  cc.evex().vfnmsub213pd(ymmA, ymmB, m);
  cc.evex().vfnmsub213pd(zmmA, zmmB, m);
  cc.evex().vfnmsub213ps(xmmA, xmmB, m);
  cc.evex().vfnmsub213ps(ymmA, ymmB, m);
  cc.evex().vfnmsub213ps(zmmA, zmmB, m);
  cc.evex().vfnmsub213sd(xmmA, xmmB, m);
  cc.evex().vfnmsub213ss(xmmA, xmmB, m);
  cc.evex().vfnmsub231pd(xmmA, xmmB, m);
  cc.evex().vfnmsub231pd(ymmA, ymmB, m);
  cc.evex().vfnmsub231pd(zmmA, zmmB, m);
  cc.evex().vfnmsub231ps(xmmA, xmmB, m);
  cc.evex().vfnmsub231ps(ymmA, ymmB, m);
  cc.evex().vfnmsub231ps(zmmA, zmmB, m);
  cc.evex().vfnmsub231sd(xmmA, xmmB, m);
  cc.evex().vfnmsub231ss(xmmA, xmmB, m);
  cc.evex().vfpclasspd(kA, m128, 0);
  cc.evex().vfpclasspd(kA, m256, 0);
  cc.evex().vfpclasspd(kA, m512, 0);
  cc.evex().vfpclassps(kA, m128, 0);
  cc.evex().vfpclassps(kA, m256, 0);
  cc.evex().vfpclassps(kA, m512, 0);
  cc.evex().vfpclasssd(kA, m, 0);
  cc.evex().vfpclassss(kA, m, 0);
  cc.evex().k(kA).vgatherdpd(xmmA, vx_ptr);
  cc.evex().k(kA).vgatherdpd(ymmA, vx_ptr);
  cc.evex().k(kA).vgatherdpd(zmmA, vy_ptr);
  cc.evex().k(kA).vgatherdps(xmmA, vx_ptr);
  cc.evex().k(kA).vgatherdps(ymmA, vy_ptr);
  cc.evex().k(kA).vgatherdps(zmmA, vz_ptr);
  cc.evex().k(kA).vgatherpf0dpd(vy_ptr);
  cc.evex().k(kA).vgatherpf0dps(vz_ptr);
  cc.evex().k(kA).vgatherpf0qpd(vz_ptr);
  cc.evex().k(kA).vgatherpf0qps(vz_ptr);
  cc.evex().k(kA).vgatherpf1dpd(vy_ptr);
  cc.evex().k(kA).vgatherpf1dps(vz_ptr);
  cc.evex().k(kA).vgatherpf1qpd(vz_ptr);
  cc.evex().k(kA).vgatherpf1qps(vz_ptr);
  cc.evex().k(kA).vgatherqpd(xmmA, vx_ptr);
  cc.evex().k(kA).vgatherqpd(ymmA, vy_ptr);
  cc.evex().k(kA).vgatherqpd(zmmA, vz_ptr);
  cc.evex().k(kA).vgatherqps(xmmA, vx_ptr);
  cc.evex().k(kA).vgatherqps(xmmA, vy_ptr);
  cc.evex().k(kA).vgatherqps(ymmA, vz_ptr);
  cc.evex().vgetexppd(xmmA, m);
  cc.evex().vgetexppd(ymmA, m);
  cc.evex().vgetexppd(zmmA, m);
  cc.evex().vgetexpps(xmmA, m);
  cc.evex().vgetexpps(ymmA, m);
  cc.evex().vgetexpps(zmmA, m);
  cc.evex().vgetexpsd(xmmA, xmmB, m);
  cc.evex().vgetexpss(xmmA, xmmB, m);
  cc.evex().vgetmantpd(xmmA, m, 0);
  cc.evex().vgetmantpd(ymmA, m, 0);
  cc.evex().vgetmantpd(zmmA, m, 0);
  cc.evex().vgetmantps(xmmA, m, 0);
  cc.evex().vgetmantps(ymmA, m, 0);
  cc.evex().vgetmantps(zmmA, m, 0);
  cc.evex().vgetmantsd(xmmA, xmmB, m, 0);
  cc.evex().vgetmantss(xmmA, xmmB, m, 0);
  cc.evex().vinsertf32x4(ymmA, ymmB, m, 0);
  cc.evex().vinsertf32x4(zmmA, zmmB, m, 0);
  cc.evex().vinsertf32x8(zmmA, zmmB, m, 0);
  cc.evex().vinsertf64x2(ymmA, ymmB, m, 0);
  cc.evex().vinsertf64x2(zmmA, zmmB, m, 0);
  cc.evex().vinsertf64x4(zmmA, zmmB, m, 0);
  cc.evex().vinserti32x4(ymmA, ymmB, m, 0);
  cc.evex().vinserti32x4(zmmA, zmmB, m, 0);
  cc.evex().vinserti32x8(zmmA, zmmB, m, 0);
  cc.evex().vinserti64x2(ymmA, ymmB, m, 0);
  cc.evex().vinserti64x2(zmmA, zmmB, m, 0);
  cc.evex().vinserti64x4(zmmA, zmmB, m, 0);
  cc.evex().vinsertps(xmmA, xmmB, m, 0);
  cc.evex().vmaxpd(xmmA, xmmB, m);
  cc.evex().vmaxpd(ymmA, ymmB, m);
  cc.evex().vmaxpd(zmmA, zmmB, m);
  cc.evex().vmaxps(xmmA, xmmB, m);
  cc.evex().vmaxps(ymmA, ymmB, m);
  cc.evex().vmaxps(zmmA, zmmB, m);
  cc.evex().vmaxsd(xmmA, xmmB, m);
  cc.evex().vmaxss(xmmA, xmmB, m);
  cc.evex().vminpd(xmmA, xmmB, m);
  cc.evex().vminpd(ymmA, ymmB, m);
  cc.evex().vminpd(zmmA, zmmB, m);
  cc.evex().vminps(xmmA, xmmB, m);
  cc.evex().vminps(ymmA, ymmB, m);
  cc.evex().vminps(zmmA, zmmB, m);
  cc.evex().vminsd(xmmA, xmmB, m);
  cc.evex().vminss(xmmA, xmmB, m);
  cc.evex().vmovapd(xmmA, m);
  cc.evex().vmovapd(m, xmmB);
  cc.evex().vmovapd(ymmA, m);
  cc.evex().vmovapd(m, ymmB);
  cc.evex().vmovapd(zmmA, m);
  cc.evex().vmovapd(m, zmmB);
  cc.evex().vmovaps(xmmA, m);
  cc.evex().vmovaps(m, xmmB);
  cc.evex().vmovaps(ymmA, m);
  cc.evex().vmovaps(m, ymmB);
  cc.evex().vmovaps(zmmA, m);
  cc.evex().vmovaps(m, zmmB);
  cc.evex().vmovd(m, xmmB);
  cc.evex().vmovd(xmmA, m);
  cc.evex().vmovddup(xmmA, m);
  cc.evex().vmovddup(ymmA, m);
  cc.evex().vmovddup(zmmA, m);
  cc.evex().vmovdqa32(xmmA, m);
  cc.evex().vmovdqa32(m, xmmB);
  cc.evex().vmovdqa32(ymmA, m);
  cc.evex().vmovdqa32(m, ymmB);
  cc.evex().vmovdqa32(zmmA, m);
  cc.evex().vmovdqa32(m, zmmB);
  cc.evex().vmovdqa64(xmmA, m);
  cc.evex().vmovdqa64(m, xmmB);
  cc.evex().vmovdqa64(ymmA, m);
  cc.evex().vmovdqa64(m, ymmB);
  cc.evex().vmovdqa64(zmmA, m);
  cc.evex().vmovdqa64(m, zmmB);
  cc.evex().vmovdqu16(xmmA, m);
  cc.evex().vmovdqu16(m, xmmB);
  cc.evex().vmovdqu16(ymmA, m);
  cc.evex().vmovdqu16(m, ymmB);
  cc.evex().vmovdqu16(zmmA, m);
  cc.evex().vmovdqu16(m, zmmB);
  cc.evex().vmovdqu32(xmmA, m);
  cc.evex().vmovdqu32(m, xmmB);
  cc.evex().vmovdqu32(ymmA, m);
  cc.evex().vmovdqu32(m, ymmB);
  cc.evex().vmovdqu32(zmmA, m);
  cc.evex().vmovdqu32(m, zmmB);
  cc.evex().vmovdqu64(xmmA, m);
  cc.evex().vmovdqu64(m, xmmB);
  cc.evex().vmovdqu64(ymmA, m);
  cc.evex().vmovdqu64(m, ymmB);
  cc.evex().vmovdqu64(zmmA, m);
  cc.evex().vmovdqu64(m, zmmB);
  cc.evex().vmovdqu8(xmmA, m);
  cc.evex().vmovdqu8(m, xmmB);
  cc.evex().vmovdqu8(ymmA, m);
  cc.evex().vmovdqu8(m, ymmB);
  cc.evex().vmovdqu8(zmmA, m);
  cc.evex().vmovdqu8(m, zmmB);
  cc.evex().vmovhpd(m, xmmB);
  cc.evex().vmovhpd(xmmA, xmmB, m);
  cc.evex().vmovhps(m, xmmB);
  cc.evex().vmovhps(xmmA, xmmB, m);
  cc.evex().vmovlpd(m, xmmB);
  cc.evex().vmovlpd(xmmA, xmmB, m);
  cc.evex().vmovlps(m, xmmB);
  cc.evex().vmovlps(xmmA, xmmB, m);
  cc.evex().vmovntdq(m, xmmB);
  cc.evex().vmovntdq(m, ymmB);
  cc.evex().vmovntdq(m, zmmB);
  cc.evex().vmovntdqa(xmmA, m);
  cc.evex().vmovntdqa(ymmA, m);
  cc.evex().vmovntdqa(zmmA, m);
  cc.evex().vmovntpd(m, xmmB);
  cc.evex().vmovntpd(m, ymmB);
  cc.evex().vmovntpd(m, zmmB);
  cc.evex().vmovntps(m, xmmB);
  cc.evex().vmovntps(m, ymmB);
  cc.evex().vmovntps(m, zmmB);
  cc.evex().vmovq(m, xmmB);
  cc.evex().vmovq(xmmA, m);
  cc.evex().vmovq(xmmA, m);
  cc.evex().vmovq(m, xmmB);
  cc.evex().vmovsd(m, xmmB);
  cc.evex().vmovsd(xmmA, m);
  cc.evex().vmovshdup(xmmA, m);
  cc.evex().vmovshdup(ymmA, m);
  cc.evex().vmovshdup(zmmA, m);
  cc.evex().vmovsldup(xmmA, m);
  cc.evex().vmovsldup(ymmA, m);
  cc.evex().vmovsldup(zmmA, m);
  cc.evex().vmovss(m, xmmB);
  cc.evex().vmovss(xmmA, m);
  cc.evex().vmovupd(xmmA, m);
  cc.evex().vmovupd(m, xmmB);
  cc.evex().vmovupd(ymmA, m);
  cc.evex().vmovupd(m, ymmB);
  cc.evex().vmovupd(zmmA, m);
  cc.evex().vmovupd(m, zmmB);
  cc.evex().vmovups(xmmA, m);
  cc.evex().vmovups(m, xmmB);
  cc.evex().vmovups(ymmA, m);
  cc.evex().vmovups(m, ymmB);
  cc.evex().vmovups(zmmA, m);
  cc.evex().vmovups(m, zmmB);
  cc.evex().vmulpd(xmmA, xmmB, m);
  cc.evex().vmulpd(ymmA, ymmB, m);
  cc.evex().vmulpd(zmmA, zmmB, m);
  cc.evex().vmulps(xmmA, xmmB, m);
  cc.evex().vmulps(ymmA, ymmB, m);
  cc.evex().vmulps(zmmA, zmmB, m);
  cc.evex().vmulsd(xmmA, xmmB, m);
  cc.evex().vmulss(xmmA, xmmB, m);
  cc.evex().vorpd(xmmA, xmmB, m);
  cc.evex().vorpd(ymmA, ymmB, m);
  cc.evex().vorpd(zmmA, zmmB, m);
  cc.evex().vorps(xmmA, xmmB, m);
  cc.evex().vorps(ymmA, ymmB, m);
  cc.evex().vorps(zmmA, zmmB, m);
  cc.evex().vpabsb(xmmA, m);
  cc.evex().vpabsb(ymmA, m);
  cc.evex().vpabsb(zmmA, m);
  cc.evex().vpabsd(xmmA, m);
  cc.evex().vpabsd(ymmA, m);
  cc.evex().vpabsd(zmmA, m);
  cc.evex().vpabsq(xmmA, m);
  cc.evex().vpabsq(ymmA, m);
  cc.evex().vpabsq(zmmA, m);
  cc.evex().vpabsw(xmmA, m);
  cc.evex().vpabsw(ymmA, m);
  cc.evex().vpabsw(zmmA, m);
  cc.evex().vpackssdw(xmmA, xmmB, m);
  cc.evex().vpackssdw(ymmA, ymmB, m);
  cc.evex().vpackssdw(zmmA, zmmB, m);
  cc.evex().vpacksswb(xmmA, xmmB, m);
  cc.evex().vpacksswb(ymmA, ymmB, m);
  cc.evex().vpacksswb(zmmA, zmmB, m);
  cc.evex().vpackusdw(xmmA, xmmB, m);
  cc.evex().vpackusdw(ymmA, ymmB, m);
  cc.evex().vpackusdw(zmmA, zmmB, m);
  cc.evex().vpackuswb(xmmA, xmmB, m);
  cc.evex().vpackuswb(ymmA, ymmB, m);
  cc.evex().vpackuswb(zmmA, zmmB, m);
  cc.evex().vpaddb(xmmA, xmmB, m);
  cc.evex().vpaddb(ymmA, ymmB, m);
  cc.evex().vpaddb(zmmA, zmmB, m);
  cc.evex().vpaddd(xmmA, xmmB, m);
  cc.evex().vpaddd(ymmA, ymmB, m);
  cc.evex().vpaddd(zmmA, zmmB, m);
  cc.evex().vpaddq(xmmA, xmmB, m);
  cc.evex().vpaddq(ymmA, ymmB, m);
  cc.evex().vpaddq(zmmA, zmmB, m);
  cc.evex().vpaddsb(xmmA, xmmB, m);
  cc.evex().vpaddsb(ymmA, ymmB, m);
  cc.evex().vpaddsb(zmmA, zmmB, m);
  cc.evex().vpaddsw(xmmA, xmmB, m);
  cc.evex().vpaddsw(ymmA, ymmB, m);
  cc.evex().vpaddsw(zmmA, zmmB, m);
  cc.evex().vpaddusb(xmmA, xmmB, m);
  cc.evex().vpaddusb(ymmA, ymmB, m);
  cc.evex().vpaddusb(zmmA, zmmB, m);
  cc.evex().vpaddusw(xmmA, xmmB, m);
  cc.evex().vpaddusw(ymmA, ymmB, m);
  cc.evex().vpaddusw(zmmA, zmmB, m);
  cc.evex().vpaddw(xmmA, xmmB, m);
  cc.evex().vpaddw(ymmA, ymmB, m);
  cc.evex().vpaddw(zmmA, zmmB, m);
  cc.evex().vpalignr(xmmA, xmmB, m, 0);
  cc.evex().vpalignr(ymmA, ymmB, m, 0);
  cc.evex().vpalignr(zmmA, zmmB, m, 0);
  cc.evex().vpandd(xmmA, xmmB, m);
  cc.evex().vpandd(ymmA, ymmB, m);
  cc.evex().vpandd(zmmA, zmmB, m);
  cc.evex().vpandnd(xmmA, xmmB, m);
  cc.evex().vpandnd(ymmA, ymmB, m);
  cc.evex().vpandnd(zmmA, zmmB, m);
  cc.evex().vpandnq(xmmA, xmmB, m);
  cc.evex().vpandnq(ymmA, ymmB, m);
  cc.evex().vpandnq(zmmA, zmmB, m);
  cc.evex().vpandq(xmmA, xmmB, m);
  cc.evex().vpandq(ymmA, ymmB, m);
  cc.evex().vpandq(zmmA, zmmB, m);
  cc.evex().vpavgb(xmmA, xmmB, m);
  cc.evex().vpavgb(ymmA, ymmB, m);
  cc.evex().vpavgb(zmmA, zmmB, m);
  cc.evex().vpavgw(xmmA, xmmB, m);
  cc.evex().vpavgw(ymmA, ymmB, m);
  cc.evex().vpavgw(zmmA, zmmB, m);
  cc.evex().vpblendmb(xmmA, xmmB, m);
  cc.evex().vpblendmb(ymmA, ymmB, m);
  cc.evex().vpblendmb(zmmA, zmmB, m);
  cc.evex().vpblendmd(xmmA, xmmB, m);
  cc.evex().vpblendmd(ymmA, ymmB, m);
  cc.evex().vpblendmd(zmmA, zmmB, m);
  cc.evex().vpblendmq(xmmA, xmmB, m);
  cc.evex().vpblendmq(ymmA, ymmB, m);
  cc.evex().vpblendmq(zmmA, zmmB, m);
  cc.evex().vpblendmw(xmmA, xmmB, m);
  cc.evex().vpblendmw(ymmA, ymmB, m);
  cc.evex().vpblendmw(zmmA, zmmB, m);
  cc.evex().vpbroadcastb(xmmA, m);
  cc.evex().vpbroadcastb(ymmA, m);
  cc.evex().vpbroadcastb(zmmA, m);
  cc.evex().vpbroadcastd(xmmA, m);
  cc.evex().vpbroadcastd(ymmA, m);
  cc.evex().vpbroadcastd(zmmA, m);
  cc.evex().vpbroadcastq(xmmA, m);
  cc.evex().vpbroadcastq(ymmA, m);
  cc.evex().vpbroadcastq(zmmA, m);
  cc.evex().vpbroadcastw(xmmA, m);
  cc.evex().vpbroadcastw(ymmA, m);
  cc.evex().vpbroadcastw(zmmA, m);
  cc.evex().vpcmpb(kA, xmmB, m, 0);
  cc.evex().vpcmpb(kA, ymmB, m, 0);
  cc.evex().vpcmpb(kA, zmmB, m, 0);
  cc.evex().vpcmpd(kA, xmmB, m, 0);
  cc.evex().vpcmpd(kA, ymmB, m, 0);
  cc.evex().vpcmpd(kA, zmmB, m, 0);
  cc.evex().vpcmpeqb(kA, xmmB, m);
  cc.evex().vpcmpeqb(kA, ymmB, m);
  cc.evex().vpcmpeqb(kA, zmmB, m);
  cc.evex().vpcmpeqd(kA, xmmB, m);
  cc.evex().vpcmpeqd(kA, ymmB, m);
  cc.evex().vpcmpeqd(kA, zmmB, m);
  cc.evex().vpcmpeqq(kA, xmmB, m);
  cc.evex().vpcmpeqq(kA, ymmB, m);
  cc.evex().vpcmpeqq(kA, zmmB, m);
  cc.evex().vpcmpeqw(kA, xmmB, m);
  cc.evex().vpcmpeqw(kA, ymmB, m);
  cc.evex().vpcmpeqw(kA, zmmB, m);
  cc.evex().vpcmpgtb(kA, xmmB, m);
  cc.evex().vpcmpgtb(kA, ymmB, m);
  cc.evex().vpcmpgtb(kA, zmmB, m);
  cc.evex().vpcmpgtd(kA, xmmB, m);
  cc.evex().vpcmpgtd(kA, ymmB, m);
  cc.evex().vpcmpgtd(kA, zmmB, m);
  cc.evex().vpcmpgtq(kA, xmmB, m);
  cc.evex().vpcmpgtq(kA, ymmB, m);
  cc.evex().vpcmpgtq(kA, zmmB, m);
  cc.evex().vpcmpgtw(kA, xmmB, m);
  cc.evex().vpcmpgtw(kA, ymmB, m);
  cc.evex().vpcmpgtw(kA, zmmB, m);
  cc.evex().vpcmpq(kA, xmmB, m, 0);
  cc.evex().vpcmpq(kA, ymmB, m, 0);
  cc.evex().vpcmpq(kA, zmmB, m, 0);
  cc.evex().vpcmpub(kA, xmmB, m, 0);
  cc.evex().vpcmpub(kA, ymmB, m, 0);
  cc.evex().vpcmpub(kA, zmmB, m, 0);
  cc.evex().vpcmpud(kA, xmmB, m, 0);
  cc.evex().vpcmpud(kA, ymmB, m, 0);
  cc.evex().vpcmpud(kA, zmmB, m, 0);
  cc.evex().vpcmpuq(kA, xmmB, m, 0);
  cc.evex().vpcmpuq(kA, ymmB, m, 0);
  cc.evex().vpcmpuq(kA, zmmB, m, 0);
  cc.evex().vpcmpuw(kA, xmmB, m, 0);
  cc.evex().vpcmpuw(kA, ymmB, m, 0);
  cc.evex().vpcmpuw(kA, zmmB, m, 0);
  cc.evex().vpcmpw(kA, xmmB, m, 0);
  cc.evex().vpcmpw(kA, ymmB, m, 0);
  cc.evex().vpcmpw(kA, zmmB, m, 0);
  cc.evex().vpcompressd(m, xmmB);
  cc.evex().vpcompressd(m, ymmB);
  cc.evex().vpcompressd(m, zmmB);
  cc.evex().vpcompressq(m, xmmB);
  cc.evex().vpcompressq(m, ymmB);
  cc.evex().vpcompressq(m, zmmB);
  cc.evex().vpconflictd(xmmA, m);
  cc.evex().vpconflictd(ymmA, m);
  cc.evex().vpconflictd(zmmA, m);
  cc.evex().vpconflictq(xmmA, m);
  cc.evex().vpconflictq(ymmA, m);
  cc.evex().vpconflictq(zmmA, m);
  cc.evex().vpermb(xmmA, xmmB, m);
  cc.evex().vpermb(ymmA, ymmB, m);
  cc.evex().vpermb(zmmA, zmmB, m);
  cc.evex().vpermd(ymmA, ymmB, m);
  cc.evex().vpermd(zmmA, zmmB, m);
  cc.evex().vpermi2b(xmmA, xmmB, m);
  cc.evex().vpermi2b(ymmA, ymmB, m);
  cc.evex().vpermi2b(zmmA, zmmB, m);
  cc.evex().vpermi2d(xmmA, xmmB, m);
  cc.evex().vpermi2d(ymmA, ymmB, m);
  cc.evex().vpermi2d(zmmA, zmmB, m);
  cc.evex().vpermi2pd(xmmA, xmmB, m);
  cc.evex().vpermi2pd(ymmA, ymmB, m);
  cc.evex().vpermi2pd(zmmA, zmmB, m);
  cc.evex().vpermi2ps(xmmA, xmmB, m);
  cc.evex().vpermi2ps(ymmA, ymmB, m);
  cc.evex().vpermi2ps(zmmA, zmmB, m);
  cc.evex().vpermi2q(xmmA, xmmB, m);
  cc.evex().vpermi2q(ymmA, ymmB, m);
  cc.evex().vpermi2q(zmmA, zmmB, m);
  cc.evex().vpermi2w(xmmA, xmmB, m);
  cc.evex().vpermi2w(ymmA, ymmB, m);
  cc.evex().vpermi2w(zmmA, zmmB, m);
  cc.evex().vpermilpd(xmmA, xmmB, m);
  cc.evex().vpermilpd(ymmA, ymmB, m);
  cc.evex().vpermilpd(zmmA, zmmB, m);
  cc.evex().vpermilpd(xmmA, m, 0);
  cc.evex().vpermilpd(ymmA, m, 0);
  cc.evex().vpermilpd(zmmA, m, 0);
  cc.evex().vpermilps(xmmA, xmmB, m);
  cc.evex().vpermilps(ymmA, ymmB, m);
  cc.evex().vpermilps(zmmA, zmmB, m);
  cc.evex().vpermilps(xmmA, m, 0);
  cc.evex().vpermilps(ymmA, m, 0);
  cc.evex().vpermilps(zmmA, m, 0);
  cc.evex().vpermq(ymmA, ymmB, m);
  cc.evex().vpermq(zmmA, zmmB, m);
  cc.evex().vpermq(ymmA, m, 0);
  cc.evex().vpermq(zmmA, m, 0);
  cc.evex().vpermt2b(xmmA, xmmB, m);
  cc.evex().vpermt2b(ymmA, ymmB, m);
  cc.evex().vpermt2b(zmmA, zmmB, m);
  cc.evex().vpermt2d(xmmA, xmmB, m);
  cc.evex().vpermt2d(ymmA, ymmB, m);
  cc.evex().vpermt2d(zmmA, zmmB, m);
  cc.evex().vpermt2pd(xmmA, xmmB, m);
  cc.evex().vpermt2pd(ymmA, ymmB, m);
  cc.evex().vpermt2pd(zmmA, zmmB, m);
  cc.evex().vpermt2ps(xmmA, xmmB, m);
  cc.evex().vpermt2ps(ymmA, ymmB, m);
  cc.evex().vpermt2ps(zmmA, zmmB, m);
  cc.evex().vpermt2q(xmmA, xmmB, m);
  cc.evex().vpermt2q(ymmA, ymmB, m);
  cc.evex().vpermt2q(zmmA, zmmB, m);
  cc.evex().vpermt2w(xmmA, xmmB, m);
  cc.evex().vpermt2w(ymmA, ymmB, m);
  cc.evex().vpermt2w(zmmA, zmmB, m);
  cc.evex().vpermw(xmmA, xmmB, m);
  cc.evex().vpermw(ymmA, ymmB, m);
  cc.evex().vpermw(zmmA, zmmB, m);
  cc.evex().vpexpandd(xmmA, m);
  cc.evex().vpexpandd(ymmA, m);
  cc.evex().vpexpandd(zmmA, m);
  cc.evex().vpexpandq(xmmA, m);
  cc.evex().vpexpandq(ymmA, m);
  cc.evex().vpexpandq(zmmA, m);
  cc.evex().vpextrb(m, xmmB, 0);
  cc.evex().vpextrd(m, xmmB, 0);
  if (cc.is64Bit()) cc.evex().vpextrq(m, xmmB, 0);
  cc.evex().vpextrw(m, xmmB, 0);
  cc.evex().k(kA).vpgatherdd(xmmA, vx_ptr);
  cc.evex().k(kA).vpgatherdd(ymmA, vy_ptr);
  cc.evex().k(kA).vpgatherdd(zmmA, vz_ptr);
  cc.evex().k(kA).vpgatherdq(xmmA, vx_ptr);
  cc.evex().k(kA).vpgatherdq(ymmA, vx_ptr);
  cc.evex().k(kA).vpgatherdq(zmmA, vy_ptr);
  cc.evex().k(kA).vpgatherqd(xmmA, vx_ptr);
  cc.evex().k(kA).vpgatherqd(xmmA, vy_ptr);
  cc.evex().k(kA).vpgatherqd(ymmA, vz_ptr);
  cc.evex().k(kA).vpgatherqq(xmmA, vx_ptr);
  cc.evex().k(kA).vpgatherqq(ymmA, vy_ptr);
  cc.evex().k(kA).vpgatherqq(zmmA, vz_ptr);
  cc.evex().vpinsrb(xmmA, xmmB, m, 0);
  cc.evex().vpinsrd(xmmA, xmmB, m, 0);
  if (cc.is64Bit()) cc.evex().vpinsrq(xmmA, xmmB, m, 0);
  cc.evex().vpinsrw(xmmA, xmmB, m, 0);
  cc.evex().vplzcntd(xmmA, m);
  cc.evex().vplzcntd(ymmA, m);
  cc.evex().vplzcntd(zmmA, m);
  cc.evex().vplzcntq(xmmA, m);
  cc.evex().vplzcntq(ymmA, m);
  cc.evex().vplzcntq(zmmA, m);
  cc.evex().vpmadd52huq(xmmA, xmmB, m);
  cc.evex().vpmadd52huq(ymmA, ymmB, m);
  cc.evex().vpmadd52huq(zmmA, zmmB, m);
  cc.evex().vpmadd52luq(xmmA, xmmB, m);
  cc.evex().vpmadd52luq(ymmA, ymmB, m);
  cc.evex().vpmadd52luq(zmmA, zmmB, m);
  cc.evex().vpmaddubsw(xmmA, xmmB, m);
  cc.evex().vpmaddubsw(ymmA, ymmB, m);
  cc.evex().vpmaddubsw(zmmA, zmmB, m);
  cc.evex().vpmaddwd(xmmA, xmmB, m);
  cc.evex().vpmaddwd(ymmA, ymmB, m);
  cc.evex().vpmaddwd(zmmA, zmmB, m);
  cc.evex().vpmaxsb(xmmA, xmmB, m);
  cc.evex().vpmaxsb(ymmA, ymmB, m);
  cc.evex().vpmaxsb(zmmA, zmmB, m);
  cc.evex().vpmaxsd(xmmA, xmmB, m);
  cc.evex().vpmaxsd(ymmA, ymmB, m);
  cc.evex().vpmaxsd(zmmA, zmmB, m);
  cc.evex().vpmaxsq(xmmA, xmmB, m);
  cc.evex().vpmaxsq(ymmA, ymmB, m);
  cc.evex().vpmaxsq(zmmA, zmmB, m);
  cc.evex().vpmaxsw(xmmA, xmmB, m);
  cc.evex().vpmaxsw(ymmA, ymmB, m);
  cc.evex().vpmaxsw(zmmA, zmmB, m);
  cc.evex().vpmaxub(xmmA, xmmB, m);
  cc.evex().vpmaxub(ymmA, ymmB, m);
  cc.evex().vpmaxub(zmmA, zmmB, m);
  cc.evex().vpmaxud(xmmA, xmmB, m);
  cc.evex().vpmaxud(ymmA, ymmB, m);
  cc.evex().vpmaxud(zmmA, zmmB, m);
  cc.evex().vpmaxuq(xmmA, xmmB, m);
  cc.evex().vpmaxuq(ymmA, ymmB, m);
  cc.evex().vpmaxuq(zmmA, zmmB, m);
  cc.evex().vpmaxuw(xmmA, xmmB, m);
  cc.evex().vpmaxuw(ymmA, ymmB, m);
  cc.evex().vpmaxuw(zmmA, zmmB, m);
  cc.evex().vpminsb(xmmA, xmmB, m);
  cc.evex().vpminsb(ymmA, ymmB, m);
  cc.evex().vpminsb(zmmA, zmmB, m);
  cc.evex().vpminsd(xmmA, xmmB, m);
  cc.evex().vpminsd(ymmA, ymmB, m);
  cc.evex().vpminsd(zmmA, zmmB, m);
  cc.evex().vpminsq(xmmA, xmmB, m);
  cc.evex().vpminsq(ymmA, ymmB, m);
  cc.evex().vpminsq(zmmA, zmmB, m);
  cc.evex().vpminsw(xmmA, xmmB, m);
  cc.evex().vpminsw(ymmA, ymmB, m);
  cc.evex().vpminsw(zmmA, zmmB, m);
  cc.evex().vpminub(xmmA, xmmB, m);
  cc.evex().vpminub(ymmA, ymmB, m);
  cc.evex().vpminub(zmmA, zmmB, m);
  cc.evex().vpminud(xmmA, xmmB, m);
  cc.evex().vpminud(ymmA, ymmB, m);
  cc.evex().vpminud(zmmA, zmmB, m);
  cc.evex().vpminuq(xmmA, xmmB, m);
  cc.evex().vpminuq(ymmA, ymmB, m);
  cc.evex().vpminuq(zmmA, zmmB, m);
  cc.evex().vpminuw(xmmA, xmmB, m);
  cc.evex().vpminuw(ymmA, ymmB, m);
  cc.evex().vpminuw(zmmA, zmmB, m);
  cc.evex().vpmovdb(m, xmmB);
  cc.evex().vpmovdb(m, ymmB);
  cc.evex().vpmovdb(m, zmmB);
  cc.evex().vpmovdw(m, xmmB);
  cc.evex().vpmovdw(m, ymmB);
  cc.evex().vpmovdw(m, zmmB);
  cc.evex().vpmovqb(m, xmmB);
  cc.evex().vpmovqb(m, ymmB);
  cc.evex().vpmovqb(m, zmmB);
  cc.evex().vpmovqd(m, xmmB);
  cc.evex().vpmovqd(m, ymmB);
  cc.evex().vpmovqd(m, zmmB);
  cc.evex().vpmovqw(m, xmmB);
  cc.evex().vpmovqw(m, ymmB);
  cc.evex().vpmovqw(m, zmmB);
  cc.evex().vpmovsdb(m, xmmB);
  cc.evex().vpmovsdb(m, ymmB);
  cc.evex().vpmovsdb(m, zmmB);
  cc.evex().vpmovsdw(m, xmmB);
  cc.evex().vpmovsdw(m, ymmB);
  cc.evex().vpmovsdw(m, zmmB);
  cc.evex().vpmovsqb(m, xmmB);
  cc.evex().vpmovsqb(m, ymmB);
  cc.evex().vpmovsqb(m, zmmB);
  cc.evex().vpmovsqd(m, xmmB);
  cc.evex().vpmovsqd(m, ymmB);
  cc.evex().vpmovsqd(m, zmmB);
  cc.evex().vpmovsqw(m, xmmB);
  cc.evex().vpmovsqw(m, ymmB);
  cc.evex().vpmovsqw(m, zmmB);
  cc.evex().vpmovswb(m, xmmB);
  cc.evex().vpmovswb(m, ymmB);
  cc.evex().vpmovswb(m, zmmB);
  cc.evex().vpmovsxbd(xmmA, m);
  cc.evex().vpmovsxbd(ymmA, m);
  cc.evex().vpmovsxbd(zmmA, m);
  cc.evex().vpmovsxbq(xmmA, m);
  cc.evex().vpmovsxbq(ymmA, m);
  cc.evex().vpmovsxbq(zmmA, m);
  cc.evex().vpmovsxbw(xmmA, m);
  cc.evex().vpmovsxbw(ymmA, m);
  cc.evex().vpmovsxbw(zmmA, m);
  cc.evex().vpmovsxdq(xmmA, m);
  cc.evex().vpmovsxdq(ymmA, m);
  cc.evex().vpmovsxdq(zmmA, m);
  cc.evex().vpmovsxwd(xmmA, m);
  cc.evex().vpmovsxwd(ymmA, m);
  cc.evex().vpmovsxwd(zmmA, m);
  cc.evex().vpmovsxwq(xmmA, m);
  cc.evex().vpmovsxwq(ymmA, m);
  cc.evex().vpmovsxwq(zmmA, m);
  cc.evex().vpmovusdb(m, xmmB);
  cc.evex().vpmovusdb(m, ymmB);
  cc.evex().vpmovusdb(m, zmmB);
  cc.evex().vpmovusdw(m, xmmB);
  cc.evex().vpmovusdw(m, ymmB);
  cc.evex().vpmovusdw(m, zmmB);
  cc.evex().vpmovusqb(m, xmmB);
  cc.evex().vpmovusqb(m, ymmB);
  cc.evex().vpmovusqb(m, zmmB);
  cc.evex().vpmovusqd(m, xmmB);
  cc.evex().vpmovusqd(m, ymmB);
  cc.evex().vpmovusqd(m, zmmB);
  cc.evex().vpmovusqw(m, xmmB);
  cc.evex().vpmovusqw(m, ymmB);
  cc.evex().vpmovusqw(m, zmmB);
  cc.evex().vpmovuswb(m, xmmB);
  cc.evex().vpmovuswb(m, ymmB);
  cc.evex().vpmovuswb(m, zmmB);
  cc.evex().vpmovwb(m, xmmB);
  cc.evex().vpmovwb(m, ymmB);
  cc.evex().vpmovwb(m, zmmB);
  cc.evex().vpmovzxbd(xmmA, m);
  cc.evex().vpmovzxbd(ymmA, m);
  cc.evex().vpmovzxbd(zmmA, m);
  cc.evex().vpmovzxbq(xmmA, m);
  cc.evex().vpmovzxbq(ymmA, m);
  cc.evex().vpmovzxbq(zmmA, m);
  cc.evex().vpmovzxbw(xmmA, m);
  cc.evex().vpmovzxbw(ymmA, m);
  cc.evex().vpmovzxbw(zmmA, m);
  cc.evex().vpmovzxdq(xmmA, m);
  cc.evex().vpmovzxdq(ymmA, m);
  cc.evex().vpmovzxdq(zmmA, m);
  cc.evex().vpmovzxwd(xmmA, m);
  cc.evex().vpmovzxwd(ymmA, m);
  cc.evex().vpmovzxwd(zmmA, m);
  cc.evex().vpmovzxwq(xmmA, m);
  cc.evex().vpmovzxwq(ymmA, m);
  cc.evex().vpmovzxwq(zmmA, m);
  cc.evex().vpmuldq(xmmA, xmmB, m);
  cc.evex().vpmuldq(ymmA, ymmB, m);
  cc.evex().vpmuldq(zmmA, zmmB, m);
  cc.evex().vpmulhrsw(xmmA, xmmB, m);
  cc.evex().vpmulhrsw(ymmA, ymmB, m);
  cc.evex().vpmulhrsw(zmmA, zmmB, m);
  cc.evex().vpmulhuw(xmmA, xmmB, m);
  cc.evex().vpmulhuw(ymmA, ymmB, m);
  cc.evex().vpmulhuw(zmmA, zmmB, m);
  cc.evex().vpmulhw(xmmA, xmmB, m);
  cc.evex().vpmulhw(ymmA, ymmB, m);
  cc.evex().vpmulhw(zmmA, zmmB, m);
  cc.evex().vpmulld(xmmA, xmmB, m);
  cc.evex().vpmulld(ymmA, ymmB, m);
  cc.evex().vpmulld(zmmA, zmmB, m);
  cc.evex().vpmullq(xmmA, xmmB, m);
  cc.evex().vpmullq(ymmA, ymmB, m);
  cc.evex().vpmullq(zmmA, zmmB, m);
  cc.evex().vpmullw(xmmA, xmmB, m);
  cc.evex().vpmullw(ymmA, ymmB, m);
  cc.evex().vpmullw(zmmA, zmmB, m);
  cc.evex().vpmultishiftqb(xmmA, xmmB, m);
  cc.evex().vpmultishiftqb(ymmA, ymmB, m);
  cc.evex().vpmultishiftqb(zmmA, zmmB, m);
  cc.evex().vpmuludq(xmmA, xmmB, m);
  cc.evex().vpmuludq(ymmA, ymmB, m);
  cc.evex().vpmuludq(zmmA, zmmB, m);
  cc.evex().vpopcntd(zmmA, m);
  cc.evex().vpopcntq(zmmA, m);
  cc.evex().vpord(xmmA, xmmB, m);
  cc.evex().vpord(ymmA, ymmB, m);
  cc.evex().vpord(zmmA, zmmB, m);
  cc.evex().vporq(xmmA, xmmB, m);
  cc.evex().vporq(ymmA, ymmB, m);
  cc.evex().vporq(zmmA, zmmB, m);
  cc.evex().vprold(xmmA, m, 0);
  cc.evex().vprold(ymmA, m, 0);
  cc.evex().vprold(zmmA, m, 0);
  cc.evex().vprolq(xmmA, m, 0);
  cc.evex().vprolq(ymmA, m, 0);
  cc.evex().vprolq(zmmA, m, 0);
  cc.evex().vprolvd(xmmA, xmmB, m);
  cc.evex().vprolvd(ymmA, ymmB, m);
  cc.evex().vprolvd(zmmA, zmmB, m);
  cc.evex().vprolvq(xmmA, xmmB, m);
  cc.evex().vprolvq(ymmA, ymmB, m);
  cc.evex().vprolvq(zmmA, zmmB, m);
  cc.evex().vprord(xmmA, m, 0);
  cc.evex().vprord(ymmA, m, 0);
  cc.evex().vprord(zmmA, m, 0);
  cc.evex().vprorq(xmmA, m, 0);
  cc.evex().vprorq(ymmA, m, 0);
  cc.evex().vprorq(zmmA, m, 0);
  cc.evex().vprorvd(xmmA, xmmB, m);
  cc.evex().vprorvd(ymmA, ymmB, m);
  cc.evex().vprorvd(zmmA, zmmB, m);
  cc.evex().vprorvq(xmmA, xmmB, m);
  cc.evex().vprorvq(ymmA, ymmB, m);
  cc.evex().vprorvq(zmmA, zmmB, m);
  cc.evex().vpsadbw(xmmA, xmmB, m);
  cc.evex().vpsadbw(ymmA, ymmB, m);
  cc.evex().vpsadbw(zmmA, zmmB, m);
  cc.evex().k(kA).vpscatterdd(vx_ptr, xmmB);
  cc.evex().k(kA).vpscatterdd(vy_ptr, ymmB);
  cc.evex().k(kA).vpscatterdd(vz_ptr, zmmB);
  cc.evex().k(kA).vpscatterdq(vx_ptr, xmmB);
  cc.evex().k(kA).vpscatterdq(vx_ptr, ymmB);
  cc.evex().k(kA).vpscatterdq(vy_ptr, zmmB);
  cc.evex().k(kA).vpscatterqd(vx_ptr, xmmB);
  cc.evex().k(kA).vpscatterqd(vy_ptr, xmmB);
  cc.evex().k(kA).vpscatterqd(vz_ptr, ymmB);
  cc.evex().k(kA).vpscatterqq(vx_ptr, xmmB);
  cc.evex().k(kA).vpscatterqq(vy_ptr, ymmB);
  cc.evex().k(kA).vpscatterqq(vz_ptr, zmmB);
  cc.evex().vpshufb(xmmA, xmmB, m);
  cc.evex().vpshufb(ymmA, ymmB, m);
  cc.evex().vpshufb(zmmA, zmmB, m);
  cc.evex().vpshufd(xmmA, m, 0);
  cc.evex().vpshufd(ymmA, m, 0);
  cc.evex().vpshufd(zmmA, m, 0);
  cc.evex().vpshufhw(xmmA, m, 0);
  cc.evex().vpshufhw(ymmA, m, 0);
  cc.evex().vpshufhw(zmmA, m, 0);
  cc.evex().vpshuflw(xmmA, m, 0);
  cc.evex().vpshuflw(ymmA, m, 0);
  cc.evex().vpshuflw(zmmA, m, 0);
  cc.evex().vpslld(xmmA, xmmB, m);
  cc.evex().vpslld(xmmA, m, 0);
  cc.evex().vpslld(ymmA, ymmB, m);
  cc.evex().vpslld(ymmA, m, 0);
  cc.evex().vpslld(zmmA, zmmB, m);
  cc.evex().vpslld(zmmA, m, 0);
  cc.evex().vpslldq(xmmA, m, 0);
  cc.evex().vpslldq(ymmA, m, 0);
  cc.evex().vpslldq(zmmA, m, 0);
  cc.evex().vpsllq(xmmA, xmmB, m);
  cc.evex().vpsllq(xmmA, m, 0);
  cc.evex().vpsllq(ymmA, ymmB, m);
  cc.evex().vpsllq(ymmA, m, 0);
  cc.evex().vpsllq(zmmA, zmmB, m);
  cc.evex().vpsllq(zmmA, m, 0);
  cc.evex().vpsllvd(xmmA, xmmB, m);
  cc.evex().vpsllvd(ymmA, ymmB, m);
  cc.evex().vpsllvd(zmmA, zmmB, m);
  cc.evex().vpsllvq(xmmA, xmmB, m);
  cc.evex().vpsllvq(ymmA, ymmB, m);
  cc.evex().vpsllvq(zmmA, zmmB, m);
  cc.evex().vpsllvw(xmmA, xmmB, m);
  cc.evex().vpsllvw(ymmA, ymmB, m);
  cc.evex().vpsllvw(zmmA, zmmB, m);
  cc.evex().vpsllw(xmmA, xmmB, m);
  cc.evex().vpsllw(xmmA, m, 0);
  cc.evex().vpsllw(ymmA, ymmB, m);
  cc.evex().vpsllw(ymmA, m, 0);
  cc.evex().vpsllw(zmmA, zmmB, m);
  cc.evex().vpsllw(zmmA, m, 0);
  cc.evex().vpsrad(xmmA, xmmB, m);
  cc.evex().vpsrad(xmmA, m, 0);
  cc.evex().vpsrad(ymmA, ymmB, m);
  cc.evex().vpsrad(ymmA, m, 0);
  cc.evex().vpsrad(zmmA, zmmB, m);
  cc.evex().vpsrad(zmmA, m, 0);
  cc.evex().vpsraq(xmmA, xmmB, m);
  cc.evex().vpsraq(xmmA, m, 0);
  cc.evex().vpsraq(ymmA, ymmB, m);
  cc.evex().vpsraq(ymmA, m, 0);
  cc.evex().vpsraq(zmmA, zmmB, m);
  cc.evex().vpsraq(zmmA, m, 0);
  cc.evex().vpsravd(xmmA, xmmB, m);
  cc.evex().vpsravd(ymmA, ymmB, m);
  cc.evex().vpsravd(zmmA, zmmB, m);
  cc.evex().vpsravq(xmmA, xmmB, m);
  cc.evex().vpsravq(ymmA, ymmB, m);
  cc.evex().vpsravq(zmmA, zmmB, m);
  cc.evex().vpsravw(xmmA, xmmB, m);
  cc.evex().vpsravw(ymmA, ymmB, m);
  cc.evex().vpsravw(zmmA, zmmB, m);
  cc.evex().vpsraw(xmmA, xmmB, m);
  cc.evex().vpsraw(xmmA, m, 0);
  cc.evex().vpsraw(ymmA, ymmB, m);
  cc.evex().vpsraw(ymmA, m, 0);
  cc.evex().vpsraw(zmmA, zmmB, m);
  cc.evex().vpsraw(zmmA, m, 0);
  cc.evex().vpsrld(xmmA, xmmB, m);
  cc.evex().vpsrld(xmmA, m, 0);
  cc.evex().vpsrld(ymmA, ymmB, m);
  cc.evex().vpsrld(ymmA, m, 0);
  cc.evex().vpsrld(zmmA, zmmB, m);
  cc.evex().vpsrld(zmmA, m, 0);
  cc.evex().vpsrldq(xmmA, m, 0);
  cc.evex().vpsrldq(ymmA, m, 0);
  cc.evex().vpsrldq(zmmA, m, 0);
  cc.evex().vpsrlq(xmmA, xmmB, m);
  cc.evex().vpsrlq(xmmA, m, 0);
  cc.evex().vpsrlq(ymmA, ymmB, m);
  cc.evex().vpsrlq(ymmA, m, 0);
  cc.evex().vpsrlq(zmmA, zmmB, m);
  cc.evex().vpsrlq(zmmA, m, 0);
  cc.evex().vpsrlvd(xmmA, xmmB, m);
  cc.evex().vpsrlvd(ymmA, ymmB, m);
  cc.evex().vpsrlvd(zmmA, zmmB, m);
  cc.evex().vpsrlvq(xmmA, xmmB, m);
  cc.evex().vpsrlvq(ymmA, ymmB, m);
  cc.evex().vpsrlvq(zmmA, zmmB, m);
  cc.evex().vpsrlvw(xmmA, xmmB, m);
  cc.evex().vpsrlvw(ymmA, ymmB, m);
  cc.evex().vpsrlvw(zmmA, zmmB, m);
  cc.evex().vpsrlw(xmmA, xmmB, m);
  cc.evex().vpsrlw(xmmA, m, 0);
  cc.evex().vpsrlw(ymmA, ymmB, m);
  cc.evex().vpsrlw(ymmA, m, 0);
  cc.evex().vpsrlw(zmmA, zmmB, m);
  cc.evex().vpsrlw(zmmA, m, 0);
  cc.evex().vpsubb(xmmA, xmmB, m);
  cc.evex().vpsubb(ymmA, ymmB, m);
  cc.evex().vpsubb(zmmA, zmmB, m);
  cc.evex().vpsubd(xmmA, xmmB, m);
  cc.evex().vpsubd(ymmA, ymmB, m);
  cc.evex().vpsubd(zmmA, zmmB, m);
  cc.evex().vpsubq(xmmA, xmmB, m);
  cc.evex().vpsubq(ymmA, ymmB, m);
  cc.evex().vpsubq(zmmA, zmmB, m);
  cc.evex().vpsubsb(xmmA, xmmB, m);
  cc.evex().vpsubsb(ymmA, ymmB, m);
  cc.evex().vpsubsb(zmmA, zmmB, m);
  cc.evex().vpsubsw(xmmA, xmmB, m);
  cc.evex().vpsubsw(ymmA, ymmB, m);
  cc.evex().vpsubsw(zmmA, zmmB, m);
  cc.evex().vpsubusb(xmmA, xmmB, m);
  cc.evex().vpsubusb(ymmA, ymmB, m);
  cc.evex().vpsubusb(zmmA, zmmB, m);
  cc.evex().vpsubusw(xmmA, xmmB, m);
  cc.evex().vpsubusw(ymmA, ymmB, m);
  cc.evex().vpsubusw(zmmA, zmmB, m);
  cc.evex().vpsubw(xmmA, xmmB, m);
  cc.evex().vpsubw(ymmA, ymmB, m);
  cc.evex().vpsubw(zmmA, zmmB, m);
  cc.evex().vpternlogd(xmmA, xmmB, m, 0);
  cc.evex().vpternlogd(ymmA, ymmB, m, 0);
  cc.evex().vpternlogd(zmmA, zmmB, m, 0);
  cc.evex().vpternlogq(xmmA, xmmB, m, 0);
  cc.evex().vpternlogq(ymmA, ymmB, m, 0);
  cc.evex().vpternlogq(zmmA, zmmB, m, 0);
  cc.evex().vptestmb(kA, xmmB, m);
  cc.evex().vptestmb(kA, ymmB, m);
  cc.evex().vptestmb(kA, zmmB, m);
  cc.evex().vptestmd(kA, xmmB, m);
  cc.evex().vptestmd(kA, ymmB, m);
  cc.evex().vptestmd(kA, zmmB, m);
  cc.evex().vptestmq(kA, xmmB, m);
  cc.evex().vptestmq(kA, ymmB, m);
  cc.evex().vptestmq(kA, zmmB, m);
  cc.evex().vptestmw(kA, xmmB, m);
  cc.evex().vptestmw(kA, ymmB, m);
  cc.evex().vptestmw(kA, zmmB, m);
  cc.evex().vptestnmb(kA, xmmB, m);
  cc.evex().vptestnmb(kA, ymmB, m);
  cc.evex().vptestnmb(kA, zmmB, m);
  cc.evex().vptestnmd(kA, xmmB, m);
  cc.evex().vptestnmd(kA, ymmB, m);
  cc.evex().vptestnmd(kA, zmmB, m);
  cc.evex().vptestnmq(kA, xmmB, m);
  cc.evex().vptestnmq(kA, ymmB, m);
  cc.evex().vptestnmq(kA, zmmB, m);
  cc.evex().vptestnmw(kA, xmmB, m);
  cc.evex().vptestnmw(kA, ymmB, m);
  cc.evex().vptestnmw(kA, zmmB, m);
  cc.evex().vpunpckhbw(xmmA, xmmB, m);
  cc.evex().vpunpckhbw(ymmA, ymmB, m);
  cc.evex().vpunpckhbw(zmmA, zmmB, m);
  cc.evex().vpunpckhdq(xmmA, xmmB, m);
  cc.evex().vpunpckhdq(ymmA, ymmB, m);
  cc.evex().vpunpckhdq(zmmA, zmmB, m);
  cc.evex().vpunpckhqdq(xmmA, xmmB, m);
  cc.evex().vpunpckhqdq(ymmA, ymmB, m);
  cc.evex().vpunpckhqdq(zmmA, zmmB, m);
  cc.evex().vpunpckhwd(xmmA, xmmB, m);
  cc.evex().vpunpckhwd(ymmA, ymmB, m);
  cc.evex().vpunpckhwd(zmmA, zmmB, m);
  cc.evex().vpunpcklbw(xmmA, xmmB, m);
  cc.evex().vpunpcklbw(ymmA, ymmB, m);
  cc.evex().vpunpcklbw(zmmA, zmmB, m);
  cc.evex().vpunpckldq(xmmA, xmmB, m);
  cc.evex().vpunpckldq(ymmA, ymmB, m);
  cc.evex().vpunpckldq(zmmA, zmmB, m);
  cc.evex().vpunpcklqdq(xmmA, xmmB, m);
  cc.evex().vpunpcklqdq(ymmA, ymmB, m);
  cc.evex().vpunpcklqdq(zmmA, zmmB, m);
  cc.evex().vpunpcklwd(xmmA, xmmB, m);
  cc.evex().vpunpcklwd(ymmA, ymmB, m);
  cc.evex().vpunpcklwd(zmmA, zmmB, m);
  cc.evex().vpxord(xmmA, xmmB, m);
  cc.evex().vpxord(ymmA, ymmB, m);
  cc.evex().vpxord(zmmA, zmmB, m);
  cc.evex().vpxorq(xmmA, xmmB, m);
  cc.evex().vpxorq(ymmA, ymmB, m);
  cc.evex().vpxorq(zmmA, zmmB, m);
  cc.evex().vrangepd(xmmA, xmmB, m, 0);
  cc.evex().vrangepd(ymmA, ymmB, m, 0);
  cc.evex().vrangepd(zmmA, zmmB, m, 0);
  cc.evex().vrangeps(xmmA, xmmB, m, 0);
  cc.evex().vrangeps(ymmA, ymmB, m, 0);
  cc.evex().vrangeps(zmmA, zmmB, m, 0);
  cc.evex().vrangesd(xmmA, xmmB, m, 0);
  cc.evex().vrangess(xmmA, xmmB, m, 0);
  cc.evex().vrcp14pd(xmmA, m);
  cc.evex().vrcp14pd(ymmA, m);
  cc.evex().vrcp14pd(zmmA, m);
  cc.evex().vrcp14ps(xmmA, m);
  cc.evex().vrcp14ps(ymmA, m);
  cc.evex().vrcp14ps(zmmA, m);
  cc.evex().vrcp14sd(xmmA, xmmB, m);
  cc.evex().vrcp14ss(xmmA, xmmB, m);
  cc.evex().vrcp28pd(zmmA, m);
  cc.evex().vrcp28ps(zmmA, m);
  cc.evex().vrcp28sd(xmmA, xmmB, m);
  cc.evex().vrcp28ss(xmmA, xmmB, m);
  cc.evex().vreducepd(xmmA, m, 0);
  cc.evex().vreducepd(ymmA, m, 0);
  cc.evex().vreducepd(zmmA, m, 0);
  cc.evex().vreduceps(xmmA, m, 0);
  cc.evex().vreduceps(ymmA, m, 0);
  cc.evex().vreduceps(zmmA, m, 0);
  cc.evex().vreducesd(xmmA, xmmB, m, 0);
  cc.evex().vreducess(xmmA, xmmB, m, 0);
  cc.evex().vrndscalepd(xmmA, m, 0);
  cc.evex().vrndscalepd(ymmA, m, 0);
  cc.evex().vrndscalepd(zmmA, m, 0);
  cc.evex().vrndscaleps(xmmA, m, 0);
  cc.evex().vrndscaleps(ymmA, m, 0);
  cc.evex().vrndscaleps(zmmA, m, 0);
  cc.evex().vrndscalesd(xmmA, xmmB, m, 0);
  cc.evex().vrndscaless(xmmA, xmmB, m, 0);
  cc.evex().vrsqrt14pd(xmmA, m);
  cc.evex().vrsqrt14pd(ymmA, m);
  cc.evex().vrsqrt14pd(zmmA, m);
  cc.evex().vrsqrt14ps(xmmA, m);
  cc.evex().vrsqrt14ps(ymmA, m);
  cc.evex().vrsqrt14ps(zmmA, m);
  cc.evex().vrsqrt14sd(xmmA, xmmB, m);
  cc.evex().vrsqrt14ss(xmmA, xmmB, m);
  cc.evex().vrsqrt28pd(zmmA, m);
  cc.evex().vrsqrt28ps(zmmA, m);
  cc.evex().vrsqrt28sd(xmmA, xmmB, m);
  cc.evex().vrsqrt28ss(xmmA, xmmB, m);
  cc.evex().vscalefpd(xmmA, xmmB, m);
  cc.evex().vscalefpd(ymmA, ymmB, m);
  cc.evex().vscalefpd(zmmA, zmmB, m);
  cc.evex().vscalefps(xmmA, xmmB, m);
  cc.evex().vscalefps(ymmA, ymmB, m);
  cc.evex().vscalefps(zmmA, zmmB, m);
  cc.evex().vscalefsd(xmmA, xmmB, m);
  cc.evex().vscalefss(xmmA, xmmB, m);
  cc.evex().k(kA).vscatterdpd(vx_ptr, xmmB);
  cc.evex().k(kA).vscatterdpd(vx_ptr, ymmB);
  cc.evex().k(kA).vscatterdpd(vy_ptr, zmmB);
  cc.evex().k(kA).vscatterdps(vx_ptr, xmmB);
  cc.evex().k(kA).vscatterdps(vy_ptr, ymmB);
  cc.evex().k(kA).vscatterdps(vz_ptr, zmmB);
  cc.evex().k(kA).vscatterpf0dpd(vy_ptr);
  cc.evex().k(kA).vscatterpf0dps(vz_ptr);
  cc.evex().k(kA).vscatterpf0qpd(vz_ptr);
  cc.evex().k(kA).vscatterpf0qps(vz_ptr);
  cc.evex().k(kA).vscatterpf1dpd(vy_ptr);
  cc.evex().k(kA).vscatterpf1dps(vz_ptr);
  cc.evex().k(kA).vscatterpf1qpd(vz_ptr);
  cc.evex().k(kA).vscatterpf1qps(vz_ptr);
  cc.evex().k(kA).vscatterqpd(vx_ptr, xmmB);
  cc.evex().k(kA).vscatterqpd(vy_ptr, ymmB);
  cc.evex().k(kA).vscatterqpd(vz_ptr, zmmB);
  cc.evex().k(kA).vscatterqps(vx_ptr, xmmB);
  cc.evex().k(kA).vscatterqps(vy_ptr, xmmB);
  cc.evex().k(kA).vscatterqps(vz_ptr, ymmB);
  cc.evex().vshuff32x4(ymmA, ymmB, m, 0);
  cc.evex().vshuff32x4(zmmA, zmmB, m, 0);
  cc.evex().vshuff64x2(ymmA, ymmB, m, 0);
  cc.evex().vshuff64x2(zmmA, zmmB, m, 0);
  cc.evex().vshufi32x4(ymmA, ymmB, m, 0);
  cc.evex().vshufi32x4(zmmA, zmmB, m, 0);
  cc.evex().vshufi64x2(ymmA, ymmB, m, 0);
  cc.evex().vshufi64x2(zmmA, zmmB, m, 0);
  cc.evex().vshufpd(xmmA, xmmB, m, 0);
  cc.evex().vshufpd(ymmA, ymmB, m, 0);
  cc.evex().vshufpd(zmmA, zmmB, m, 0);
  cc.evex().vshufps(xmmA, xmmB, m, 0);
  cc.evex().vshufps(ymmA, ymmB, m, 0);
  cc.evex().vshufps(zmmA, zmmB, m, 0);
  cc.evex().vsqrtpd(xmmA, m);
  cc.evex().vsqrtpd(ymmA, m);
  cc.evex().vsqrtpd(zmmA, m);
  cc.evex().vsqrtps(xmmA, m);
  cc.evex().vsqrtps(ymmA, m);
  cc.evex().vsqrtps(zmmA, m);
  cc.evex().vsqrtsd(xmmA, xmmB, m);
  cc.evex().vsqrtss(xmmA, xmmB, m);
  cc.evex().vsubpd(xmmA, xmmB, m);
  cc.evex().vsubpd(ymmA, ymmB, m);
  cc.evex().vsubpd(zmmA, zmmB, m);
  cc.evex().vsubps(xmmA, xmmB, m);
  cc.evex().vsubps(ymmA, ymmB, m);
  cc.evex().vsubps(zmmA, zmmB, m);
  cc.evex().vsubsd(xmmA, xmmB, m);
  cc.evex().vsubss(xmmA, xmmB, m);
  cc.evex().vucomisd(xmmA, m);
  cc.evex().vucomiss(xmmA, m);
  cc.evex().vunpckhpd(xmmA, xmmB, m);
  cc.evex().vunpckhpd(ymmA, ymmB, m);
  cc.evex().vunpckhpd(zmmA, zmmB, m);
  cc.evex().vunpckhps(xmmA, xmmB, m);
  cc.evex().vunpckhps(ymmA, ymmB, m);
  cc.evex().vunpckhps(zmmA, zmmB, m);
  cc.evex().vunpcklpd(xmmA, xmmB, m);
  cc.evex().vunpcklpd(ymmA, ymmB, m);
  cc.evex().vunpcklpd(zmmA, zmmB, m);
  cc.evex().vunpcklps(xmmA, xmmB, m);
  cc.evex().vunpcklps(ymmA, ymmB, m);
  cc.evex().vunpcklps(zmmA, zmmB, m);
  cc.evex().vxorpd(xmmA, xmmB, m);
  cc.evex().vxorpd(ymmA, ymmB, m);
  cc.evex().vxorpd(zmmA, zmmB, m);
  cc.evex().vxorps(xmmA, xmmB, m);
  cc.evex().vxorps(ymmA, ymmB, m);
  cc.evex().vxorps(zmmA, zmmB, m);
}

// Emits a long AVX512 instruction sequence in the requested instruction form.
//
// `InstForm::kReg` selects the register-only generator; every other form selects the register/memory generator.
// All register operands are forwarded unchanged to the selected generator.
template<typename Emitter>
static void generateAvx512SequenceInternal(
  Emitter& cc,
  InstForm form,
  const x86::Gp& gp,
  const x86::KReg& kA, const x86::KReg& kB, const x86::KReg& kC,
  const x86::Vec& vecA, const x86::Vec& vecB, const x86::Vec& vecC, const x86::Vec& vecD) {

  if (form != InstForm::kReg) {
    generateAvx512SequenceInternalRegMem(cc, gp, kA, kB, kC, vecA, vecB, vecC, vecD);
    return;
  }

  generateAvx512SequenceInternalRegOnly(cc, gp, kA, kB, kC, vecA, vecB, vecC, vecD);
}

static void generateAvx512Sequence(BaseEmitter& emitter, InstForm form, bool emitPrologEpilog) {
  using namespace asmjit::x86;

  if (emitter.isAssembler()) {
    Assembler& cc = *emitter.as<Assembler>();

    if (emitPrologEpilog) {
      FuncDetail func;
      func.init(FuncSignature::build<void, void*, const void*, size_t>(), cc.environment());

      FuncFrame frame;
      frame.init(func);
      frame.addDirtyRegs(eax, k1, k2, k3, zmm0, zmm1, zmm2, zmm3);
      frame.finalize();

      cc.emitProlog(frame);
      generateAvx512SequenceInternal(cc, form, eax, k1, k2, k3, zmm0, zmm1, zmm2, zmm3);
      cc.emitEpilog(frame);
    }
    else {
      generateAvx512SequenceInternal(cc, form, eax, k1, k2, k3, zmm0, zmm1, zmm2, zmm3);
    }
  }
#ifndef ASMJIT_NO_BUILDER
  else if (emitter.isBuilder()) {
    Builder& cc = *emitter.as<Builder>();

    if (emitPrologEpilog) {
      FuncDetail func;
      func.init(FuncSignature::build<void, void*, const void*, size_t>(), cc.environment());

      FuncFrame frame;
      frame.init(func);
      frame.addDirtyRegs(eax, k1, k2, k3, zmm0, zmm1, zmm2, zmm3);
      frame.finalize();

      cc.emitProlog(frame);
      generateAvx512SequenceInternal(cc, form, eax, k1, k2, k3, zmm0, zmm1, zmm2, zmm3);
      cc.emitEpilog(frame);
    }
    else {
      generateAvx512SequenceInternal(cc, form, eax, k1, k2, k3, zmm0, zmm1, zmm2, zmm3);
    }
  }
#endif
#ifndef ASMJIT_NO_COMPILER
  else if (emitter.isCompiler()) {
    Compiler& cc = *emitter.as<Compiler>();

    Gp gp = cc.newGpz("gp");
    Zmm vecA = cc.newZmm("vecA");
    Zmm vecB = cc.newZmm("vecB");
    Zmm vecC = cc.newZmm("vecC");
    Zmm vecD = cc.newZmm("vecD");

    KReg kA = cc.newKq("kA");
    KReg kB = cc.newKq("kB");
    KReg kC = cc.newKq("kC");

    cc.addFunc(FuncSignature::build<void>());
    generateAvx512SequenceInternal(cc, form, gp, kA, kB, kC, vecA, vecB, vecC, vecD);
    cc.endFunc();
  }
#endif
}

// Benchmarks a single code-generation callback with every x86 emitter that is compiled in.
//
// Prints "<description>:" first and an empty line last. `emitterFn` is called as `emitterFn(emitter, flag)`,
// where `flag` requests prolog/epilog emission. The Assembler is benchmarked as "[raw]", "[validated]" (with
// `DiagnosticOptions::kValidateAssembler` added), and "[prolog/epilog]". The Builder is benchmarked as
// "[no-asm]", "[finalized]", and "[prolog/epilog]", and the Compiler as "[no-asm]" and "[finalized]"; both
// are skipped when disabled by ASMJIT_NO_BUILDER / ASMJIT_NO_COMPILER.
template<typename EmitterFn>
static void benchmarkX86Function(Arch arch, uint32_t numIterations, const char* description, const EmitterFn& emitterFn) noexcept {
  CodeHolder code;
  printf("%s:\n", description);

  // Stays zero when Builder support is compiled out, as the count is obtained through x86::Builder.
  uint32_t instCount = 0;

#ifndef ASMJIT_NO_BUILDER
  const auto builderPlain = [&](x86::Builder& cc) {
    emitterFn(cc, false);
  };
  const auto builderFinalized = [&](x86::Builder& cc) {
    emitterFn(cc, false);
    cc.finalize();
  };
  const auto builderFramedFinalized = [&](x86::Builder& cc) {
    emitterFn(cc, true);
    cc.finalize();
  };

  instCount = asmjit_perf_utils::calculateInstructionCount<x86::Builder>(code, arch, builderPlain);
#endif

  const auto assemblerRaw = [&](x86::Assembler& cc) {
    emitterFn(cc, false);
  };
  const auto assemblerValidated = [&](x86::Assembler& cc) {
    cc.addDiagnosticOptions(DiagnosticOptions::kValidateAssembler);
    emitterFn(cc, false);
  };
  const auto assemblerFramed = [&](x86::Assembler& cc) {
    emitterFn(cc, true);
  };

  asmjit_perf_utils::bench<x86::Assembler>(code, arch, numIterations, "[raw]", instCount, assemblerRaw);
  asmjit_perf_utils::bench<x86::Assembler>(code, arch, numIterations, "[validated]", instCount, assemblerValidated);
  asmjit_perf_utils::bench<x86::Assembler>(code, arch, numIterations, "[prolog/epilog]", instCount, assemblerFramed);

#ifndef ASMJIT_NO_BUILDER
  asmjit_perf_utils::bench<x86::Builder>(code, arch, numIterations, "[no-asm]", instCount, builderPlain);
  asmjit_perf_utils::bench<x86::Builder>(code, arch, numIterations, "[finalized]", instCount, builderFinalized);
  asmjit_perf_utils::bench<x86::Builder>(code, arch, numIterations, "[prolog/epilog]", instCount, builderFramedFinalized);
#endif

#ifndef ASMJIT_NO_COMPILER
  // The Compiler callbacks always pass `true` as the prolog/epilog flag.
  const auto compilerPlain = [&](x86::Compiler& cc) {
    emitterFn(cc, true);
  };
  const auto compilerFinalized = [&](x86::Compiler& cc) {
    emitterFn(cc, true);
    cc.finalize();
  };

  asmjit_perf_utils::bench<x86::Compiler>(code, arch, numIterations, "[no-asm]", instCount, compilerPlain);
  asmjit_perf_utils::bench<x86::Compiler>(code, arch, numIterations, "[finalized]", instCount, compilerFinalized);
#endif

  printf("\n");
}

void benchmarkX86Emitters(uint32_t numIterations, bool testX86, bool testX64) {
  uint32_t i = 0;
  uint32_t n = 0;

  Arch archs[2] {};

  if (testX86) archs[n++] = Arch::kX86;
  if (testX64) archs[n++] = Arch::kX64;

  for (i = 0; i < n; i++) {
    static const char description[] = "GpSequence<Reg> (Sequence of GP instructions - reg-only)";
    benchmarkX86Function(archs[i], numIterations, description, [](BaseEmitter& emitter, bool emitPrologEpilog) {
      generateGpSequence(emitter, InstForm::kReg, emitPrologEpilog);
    });
  }

  for (i = 0; i < n; i++) {
    static const char description[] = "GpSequence<Mem> (Sequence of GP instructions - reg/mem)";
    benchmarkX86Function(archs[i], numIterations, description, [](BaseEmitter& emitter, bool emitPrologEpilog) {
      generateGpSequence(emitter, InstForm::kMem, emitPrologEpilog);
    });
  }

  for (i = 0; i < n; i++) {
    static const char description[] = "SseSequence<Reg> (sequence of SSE+ instructions - reg-only)";
    benchmarkX86Function(archs[i], numIterations, description, [](BaseEmitter& emitter, bool emitPrologEpilog) {
      generateSseSequence(emitter, InstForm::kReg, emitPrologEpilog);
    });
  }

  for (i = 0; i < n; i++) {
    static const char description[] = "SseSequence<Mem> (sequence of SSE+ instructions - reg/mem)";
    benchmarkX86Function(archs[i], numIterations, description, [](BaseEmitter& emitter, bool emitPrologEpilog) {
      generateSseSequence(emitter, InstForm::kMem, emitPrologEpilog);
    });
  }

  for (i = 0; i < n; i++) {
    static const char description[] = "AvxSequence<Reg> (sequence of AVX+ instructions - reg-only)";
    benchmarkX86Function(archs[i], numIterations, description, [](BaseEmitter& emitter, bool emitPrologEpilog) {
      generateAvxSequence(emitter, InstForm::kReg, emitPrologEpilog);
    });
  }

  for (i = 0; i < n; i++) {
    static const char description[] = "AvxSequence<Mem> (sequence of AVX+ instructions - reg/mem)";
    benchmarkX86Function(archs[i], numIterations, description, [](BaseEmitter& emitter, bool emitPrologEpilog) {
      generateAvxSequence(emitter, InstForm::kMem, emitPrologEpilog);
    });
  }

  for (i = 0; i < n; i++) {
    static const char description[] = "Avx512Sequence<Reg> (sequence of AVX512+ instructions - reg-only)";
    benchmarkX86Function(archs[i], numIterations, description, [](BaseEmitter& emitter, bool emitPrologEpilog) {
      generateAvx512Sequence(emitter, InstForm::kReg, emitPrologEpilog);
    });
  }

  for (i = 0; i < n; i++) {
    static const char description[] = "Avx512Sequence<Mem> (sequence of AVX512+ instructions - reg/mem)";
    benchmarkX86Function(archs[i], numIterations, description, [](BaseEmitter& emitter, bool emitPrologEpilog) {
      generateAvx512Sequence(emitter, InstForm::kMem, emitPrologEpilog);
    });
  }

  for (i = 0; i < n; i++) {
    static const char description[] = "SseAlphaBlend (alpha-blend function with labels and jumps)";
    benchmarkX86Function(archs[i], numIterations, description, [](BaseEmitter& emitter, bool emitPrologEpilog) {
      asmtest::generateSseAlphaBlend(emitter, emitPrologEpilog);
    });
  }
}

#endif // !ASMJIT_NO_X86
